# AWS (Amazon Web Services) to CHAT Converter
# Andrew Yankes, first working version June 2021, latest version February 2022
# Tested with Python 3.10.1, not necessarily reliable with earlier versions (even 3.10.0)
#
# This program converts an AWS-generated transcript in .json format to a CHAT file with each word individually timestamped.
# Skip to the bottom to find the relevant command, and hit F5 to run the program after specifying the .json filename.

import json, os

class ConvertJob:
    """Convert one AWS transcript (.json) into a CHAT (.cha) file with every word timestamped.

    Constructing an instance performs the whole conversion: the .json file is parsed, and a
    .cha file with the same base name (final extension replaced by '.cha') is written.

    The converter assumes two speakers, the English language, and linkage to audio files,
    and it supplies a generic corpus name. Any of these can be changed through the
    constructor arguments.

    Attributes:
        numspeakers, lang, corpus, media: the constructor arguments, used for the CHAT headers.
        abbrevFilename: the input path without its final extension.
        timestamps: dict keyed by a word's start time (a string, in seconds, exactly as it
            appears in the AWS file). Each value is the list [speaker ID, end time, word].
        oldfile, newfile: the input and output file objects. Both are already closed by the
            time the constructor returns.
    """

    # Stored in place of the word when a timestamped span has no matching word.
    # writeTranscript() skips these entries.
    PAUSE = '(.)'

    def __init__(self,fullFilename,numspeakers=2,lang='eng',corpus='corpus_name',media='audio'):
        """Run the conversion of `fullFilename`.

        Args:
            fullFilename: path of the AWS .json transcript.
            numspeakers: number of SPKn participants declared in the CHAT headers.
            lang: language code written to @Languages and the @ID lines.
            corpus: corpus name written to the @ID lines.
            media: media type written to the @Media line.

        Raises:
            OSError: if the input cannot be opened or the output cannot be created.
            json.JSONDecodeError: if the input is not valid JSON.
            KeyError: if a speaker span lacks start_time, speaker_label or end_time.
        """
        self.numspeakers = numspeakers
        self.lang = lang
        self.corpus = corpus
        self.media = media
        # splitext only strips the final extension, so dots elsewhere in the path are preserved.
        self.abbrevFilename = os.path.splitext(fullFilename)[0]
        self.timestamps = dict()
        # Parse the input completely before creating the output, so that an unreadable input
        # does not leave a half-written .cha file behind. Both handles are closed even on error.
        with open(fullFilename,'r',encoding='utf-8') as oldfile:
            self.oldfile = oldfile
            self.readAWSFile()
        with open(self.abbrevFilename + '.cha','w',encoding='utf-8') as newfile:
            self.newfile = newfile
            self.initChatFile()
            self.writeTranscript()

    def initChatFile(self):
        """Write the headers at the beginning of the CHAT file."""
        self.newfile.write('@Begin\n')
        self.newfile.write('@Languages:\t%s\n' % self.lang)
        # Writes the @Participants line and each @ID line.
        if self.numspeakers > 0:
            participants = ', '.join('SPK%d Speaker' % i for i in range(self.numspeakers))
            self.newfile.write('@Participants:\t%s\n' % participants)
            self.newfile.write('@Options:\tmulti\n')
            for j in range(self.numspeakers):
                self.newfile.write('@ID:\t%s|%s|SPK%d|||||Speaker|||\n' % (self.lang,self.corpus,j))
        # Only the file name (not any directory part of the path) belongs on the @Media line.
        self.newfile.write('@Media:\t%s, %s\n' % (os.path.basename(self.abbrevFilename),self.media))

    def readAWSFile(self):
        """Parse the AWS transcript in self.oldfile and fill self.timestamps.

        The AWS file holds the timing information twice: once as spans mapped to speaker
        identities (results -> speaker_labels -> segments -> items) and once as spans mapped
        to words (results -> items). The two lists are merged here on the start time.
        The plain-text transcript at the top of the file has no timestamps and is ignored.
        """
        results = json.load(self.oldfile).get('results') or {}
        # Index the word list by start time. Entries without a start time cannot be linked
        # to a speaker span, so they are left out. If a start time occurs more than once,
        # the first occurrence wins.
        words = dict()
        for item in results.get('items') or []:
            alternatives = item.get('alternatives')
            if 'start_time' in item and alternatives:
                words.setdefault(item['start_time'],alternatives[0].get('content',''))
        # Walk the speaker spans in file order, which gives the dictionary its chronological order.
        speakerLabels = results.get('speaker_labels') or {}
        for segment in speakerLabels.get('segments') or []:
            for span in segment.get('items') or []:
                timeStart = span['start_time']
                # Keep only what follows the last underscore of the label ('spk_0' -> '0').
                thisSpeaker = span['speaker_label'].rpartition('_')[2]
                # Occasionally a span in the speaker list has no counterpart in the word list.
                # Such a span is recorded as a pause rather than as a bogus word.
                word = words.get(timeStart,self.PAUSE)
                self.timestamps[timeStart] = [thisSpeaker,span['end_time'],word]

    def milliseconds(self,timestamp):
        """Return a timestamp given in seconds (string or number) as a whole-millisecond string.

        The product is rounded rather than truncated; truncation turned e.g. '1.005' into
        '1004' because 1.005 * 1000 evaluates to slightly less than 1005 in floating point.
        """
        return str(round(float(timestamp)*1000))

    def editedWord(self,word):
        """Remove backslashes from the word, and decapitalize it unless it's "I" or a contraction with "I"."""
        word = word.replace("\\","")
        if word == 'I' or (len(word) > 2 and word.startswith("I'")):
            return word
        return word.lower()

    def writeTranscript(self):
        """Write the transcript body and the closing @End line to the CHAT file.

        Goes through each timestamp in the dictionary and writes the accompanying word,
        followed by its •start_end• bullet in milliseconds. A new *SPKn: line is started
        whenever the speaker ID changes. Pause entries are not written at all.
        """
        # The speaker of the first word is not always 0, so the first line's speaker ID is
        # taken from the first entry actually written rather than assumed.
        currentSpeaker = None
        for timeStart,(thisSpeaker,timeEnd,word) in self.timestamps.items():
            if word == self.PAUSE: continue
            if currentSpeaker is None:
                self.newfile.write('*SPK%s:\t' % thisSpeaker)
            elif thisSpeaker != currentSpeaker:
                self.newfile.write('.\n*SPK%s:\t' % thisSpeaker)
            currentSpeaker = thisSpeaker
            self.newfile.write('%s •%s_%s• ' % (self.editedWord(word),
                                                self.milliseconds(timeStart),
                                                self.milliseconds(timeEnd)))
        # Terminate the last utterance only if one was started (the transcript may be empty).
        if currentSpeaker is not None:
            self.newfile.write('.\n')
        self.newfile.write('@End')


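# This routine converts every .json file in the current working directory into a CHAT file.
# (The working directory is normally the folder containing this .py file when the script is launched from there.)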
def multiConvert(numspeakers=2,lang='eng',corpus='corpus_name',media='audio'):
    """Run a ConvertJob on each .json file listed by os.listdir(), then print 'Finished'.

    All four arguments are passed through unchanged to every ConvertJob.
    """
    jsonNames = [entry for entry in os.listdir() if entry.endswith('.json')]
    for jsonName in jsonNames:
        ConvertJob(jsonName,numspeakers,lang,corpus,media)
    print('Finished')





# **************************************************************************************************************************************
# EXECUTABLE LINES *********************************************************************************************************************
# **************************************************************************************************************************************


# Below are three choices for an executable line. They sit under the __main__ guard so that
# they run when this file is executed directly (e.g. with F5) but not when it is imported.
# Keep at least one of them uncommented, since the guard needs a statement in its body.
if __name__ == '__main__':
    # The regular multiConvert() line will convert all .json files
    # using the default parameters.
    #multiConvert()
    # ---
    # If you want to change any of those default parameters for all the CHAT files, such as if you want to specify
    # a corpus name or if you know that the media is video instead of audio, you can use this line instead,
    # replacing whichever parameters you want to change. This is just here for convenience and clarity.
    #multiConvert(2,'eng','corpus_name','audio')
    # ---
    # If you only want to convert a single .json file, you can use this line.
    # Replace '37-new.json' with the target filename.
    job = ConvertJob('37-new.json')



# Note: When you try to open the newly-created .cha file, you may get a popup message: "Data file [path/filename] has been changed", etc.
# Ignore this. It's just a notification, not a warning that anything's wrong.
# You will, however, want to run chstring on the resulting file, as it wasn't created with any sense of margins.



