### grid.py
### Kyle Gorman (kgorman@ling.upenn.edu)
### Praat, Carmel, ARPABET and HTK functions
### part of p2tk


# Carmel functions

def shift(prefix):
    """ Remove the leftmost item of prefix and return the rest as a (hashable)
    tuple. Raises IndexError if prefix is empty. """
    items = list(prefix)
    items.pop(0)
    return tuple(items)


def push(prefix, phone):
    """ Add phone on the right edge of prefix and return a (hashable) tuple """
    items = list(prefix)
    items.append(phone)
    return tuple(items)


def slide(prefix, phone):
    """ Add phone on the right edge and remove the leftmost item, i.e. move a
    fixed-width window one step to the right """
    return shift(push(prefix, phone))


def carmel(instate, otstate, insymbol='*e*', otsymbol='*e*', p=''):
    """ Takes start state, end state, in and out symbols, and probability (and
    bang and the state number if you want to tie it) arguments, and returns
    the string to write() this arc in carmel format. """
    return '(%s (%s %s %s %s))\n' % (instate, otstate, insymbol, otsymbol, p)


def carmel_EM(inModel, otModel, devTxt, nIter=50, nRand=50,
              perpRatio=1.0-1e-8):
    """ Start carmel EM through a shell and return the pipe object given back
    by os.popen(); the command redirects carmel's output into otModel. """
    from os import popen
    # NOTE(review): '%f' keeps six decimals, so the default perpRatio is
    # rendered as '1.000000' on the command line -- confirm this is intended.
    command = 'carmel.cage -tC:? -M %d -! %d -X %f %s %s > %s' % (
        nIter, nRand, perpRatio, devTxt, inModel, otModel)
    return popen(command)


def carmel_test(model, tstTxt, CPL):
    """ Run the carmel -S corpus probability tester and return the number
    after the last '^' on the first line of its stderr, divided by CPL.
    Raises ValueError if that text is not a number. """
    # popen2 is gone from Python 3; subprocess is available on both lines.
    from subprocess import PIPE, Popen
    process = Popen('carmel.cage -S %s %s' % (tstTxt, model), shell=True,
                    stdin=PIPE, stdout=PIPE, stderr=PIPE,
                    universal_newlines=True)
    # communicate() drains stdout as well as stderr, so the child can never
    # block on a full stdout pipe (this is why stdout had to be read before).
    output, errors = process.communicate()
    first = errors.splitlines()[0] if errors else ''
    return float(first.rstrip().split('^')[-1]) / float(CPL)


def capScheme(word):
    """ Given a word, return its capitalization scheme: 'L+' (no upper case
    letters), 'U+' (no lower case letters), 'UL+' (only the first character
    is upper case), 'LUL+' (only the second character is upper case), or
    otherwise a tuple holding 1 / -1 / 0 for each lower case / upper case /
    other character. """
    try:                                     # Python 2 names
        from string import lowercase, uppercase
    except ImportError:                      # Python 3 names
        from string import ascii_lowercase as lowercase
        from string import ascii_uppercase as uppercase
    signs = [(x in lowercase) - (x in uppercase) for x in word]
    if sum(signs) == len(word) or -1 not in signs:  # 2nd case: low w/ punc
        return 'L+'
    if sum(signs) == -len(word) or 1 not in signs:  # 2nd case: up w/ punc
        return 'U+'
    if signs[0] == -1 and -1 not in signs[1:]:      # first capped, rest down
        return 'UL+'
    if signs[0] > -1 and signs[1] == -1 and -1 not in signs[2:]:  # ie -UL+
        return 'LUL+'
    return tuple(signs)


def cmudict():
    """ Returns dictionary object containing CMUDICT words (lower case) as
    keys and a list of (upper case) phonemes as values. Reads 'cmudict.06d'
    from the current directory; alternate pronunciations (entries whose word
    ends in ')', e.g. 'WORD(1)'), blank lines and ';;;' comment lines are
    skipped. """
    from os.path import exists
    assert exists('cmudict.06d'), 'cmudict.06d must be in directory'
    entries = {}
    with open('cmudict.06d', 'r') as source:
        for line in source:
            pieces = line.split()
            if not pieces or pieces[0].startswith(';;;'):
                continue                     # nothing to store on this line
            if pieces[0][-1] != ')':         # not the kind ending in '(1)'
                entries[pieces[0].lower()] = pieces[1:]
    return entries


# praat and HTK functions

def _field(line):
    """ Return the value of a long-format 'key = value' line. Only the first
    ' = ' separates, so the value itself may contain ' = '. """
    key, separator, value = line.rstrip().partition(' = ')
    if not separator:
        raise ValueError('expected a "key = value" line, got %r' % line)
    return value


def _bare(line):
    """ Return the value of a short-format line (the line minus trailing
    whitespace). """
    return line.rstrip()


def _skip(lines, count):
    """ Drop count lines from the left of the deque lines. """
    for _ in range(count):
        lines.popleft()


def read(inputFile):
    """
    Input: name of Praat TextGrid file (long or short text format)
    Output: List of tiers, which are lists of (time,string) tuples (in the
    case of point tiers), or (start,stop,string) tuples (in the case of
    interval tiers)

    Every label is expected to sit on a single line. Raises IndexError if
    the file ends early and ValueError if a number cannot be parsed.
    """
    from collections import deque
    with open(inputFile, 'r') as source:
        lines = deque(source)                # O(1) removal on the left
    _skip(lines, 3)                          # file type, object class, blank
    longFormat = 'xmin' in lines[0]          # long TextGrid if this is true
    value = _field if longFormat else _bare  # how to get a line's value
    _skip(lines, 3)                          # xmin, xmax, tiers?
    nTiers = int(value(lines.popleft()))     # size
    if longFormat:
        lines.popleft()                      # item []
    tiers = []                               # will contain all the tiers
    for _ in range(nTiers):                  # loop over the tiers
        if longFormat:
            lines.popleft()                  # item [n]
        isInterval = 'IntervalTier' in lines[0]
        _skip(lines, 4)                      # class type, name, xmin, xmax
        size = int(value(lines.popleft()))   # number of intervals/points
        tier = []                            # gonna be full of tuples
        for _ in range(size):
            if longFormat:
                lines.popleft()              # intervals [n] / points [n]
            if isInterval:
                start = float(value(lines.popleft()))
                stop = float(value(lines.popleft()))
                label = value(lines.popleft())
                tier.append((start, stop, label.strip('"')))
            else:
                time = float(value(lines.popleft()))
                label = value(lines.popleft())
                tier.append((time, label.strip('"')))
        tiers.append(tier)                   # write it into big structure
    return tiers


# write a Praat TextGrid

def _tier_span(tier):
    """ Return (first start, last end) of an interval tier or (first time,
    last time) of a point tier. Raises IndexError for an empty tier. """
    if len(tier[0]) > 2:                     # interval tier
        return tier[0][0], tier[-1][1]
    return tier[0][0], tier[-1][0]           # point tier


def write(list, outputFile, format=None):
    """
    Input: list of tiers, which are lists of (time,string) tuples (in the
    case of point tiers) in temporal order and/or lists of
    (start,stop,string) tuples (in the case of interval tiers), an output
    filename, and an optional format (the TextGrid is written in 'short'
    format if format is non-null, in long format otherwise)
    Output: none, but TextGrid is printed to output file

    Tiers are named after their 1-based position. Every tier must hold at
    least one item (IndexError otherwise); with no tiers at all xmin and
    xmax are written as 0.
    """
    tiers = list                             # the parameter shadows a builtin
    # The bounds are worked out before the file is opened, so bad input does
    # not leave a truncated file behind.
    spans = [_tier_span(tier) for tier in tiers]
    xmin = min(span[0] for span in spans) if spans else 0
    xmax = max(span[1] for span in spans) if spans else 0
    with open(outputFile, 'w') as out:
        out.write('File type = "ooTextFile"\n')      # header first line
        out.write('Object class = "TextGrid"\n\n')   # 2nd and third line
        if format:                                   # nonnull, so short
            out.write(str(xmin) + '\n')              # xmin
            out.write(str(xmax) + '\n')              # xmax
            # Praat's text format carries an <exists> flag on the tiers line
            out.write('<exists>\n')                  # tiers line
            out.write(str(len(tiers)) + '\n')        # number of tiers
            for tCounter, tier in enumerate(tiers, 1):
                if len(tier[0]) > 2:         # start/stop, so interval tier
                    out.write('"IntervalTier"\n')    # class label
                else:                        # type is point tier
                    out.write('"TextTier"\n')        # class label
                out.write('"' + str(tCounter) + '"\n')   # name
                out.write(str(xmin) + '\n')          # xmin
                out.write(str(xmax) + '\n')          # xmax
                out.write(str(len(tier)) + '\n')     # number of items
                for item in tier:
                    for number in item[:-1]:         # the time(s)
                        out.write(str(number) + '\n')
                    out.write('"' + item[-1] + '"\n')    # label
        else:                                        # long format TextGrid
            out.write('xmin = ' + str(xmin) + '\n')  # xmin
            out.write('xmax = ' + str(xmax) + '\n')  # xmax
            out.write('tiers? <exists> \n')          # tiers line
            out.write('size = ' + str(len(tiers)) + '\n')
            out.write('item []:\n')                  # last piece of header
            for tCounter, tier in enumerate(tiers, 1):
                out.write(' item [' + str(tCounter) + ']:\n')
                if len(tier[0]) > 2:         # start/stop, so interval tier
                    out.write(' class = "IntervalTier"\n')
                    out.write(' name = "' + str(tCounter) + '"\n')
                    out.write(' xmin = ' + str(xmin) + '\n')
                    out.write(' xmax = ' + str(xmax) + '\n')
                    out.write(' intervals: size = ' + str(len(tier)) + '\n')
                    for iCounter, intrvl in enumerate(tier, 1):
                        out.write(' intervals [' + str(iCounter) + ']:\n')
                        out.write(' xmin = ' + str(intrvl[0]) + '\n')
                        out.write(' xmax = ' + str(intrvl[1]) + '\n')
                        out.write(' text = "' + intrvl[2] + '"\n')
                else:                        # type is point tier
                    out.write(' class = "TextTier"\n')
                    out.write(' name = "' + str(tCounter) + '"\n')
                    out.write(' xmin = ' + str(xmin) + '\n')
                    out.write(' xmax = ' + str(xmax) + '\n')
                    out.write(' points: size = ' + str(len(tier)) + '\n')
                    for pCounter, point in enumerate(tier, 1):
                        out.write(' points [' + str(pCounter) + ']:\n')
                        out.write(' time = ' + str(point[0]) + '\n')
                        out.write(' mark = "' + point[1] + '"\n')


def readMLF(inputFile):
    """
    Input: name of HTK .mlf file created by issuing the command HVITE -o SM...
    Output: A list of tuples, one per file named in the .mlf, in file order.
    Each tuple is a (string,list) pair. The string is the base name (no
    folder, no extension) of the file used to generate the .mlf. The list is
    a grid, a list of tiers. The list is always two items long (but not a
    tuple for conformity to the TextGrid functions above). The first list is
    the word list. The second list is the phones list. Each one of these
    lists is a list of (start,stop,string) tuples, with times in seconds.
    By passing each grid to write() with an appropriately named file, you
    can create TextGrids for all the files in an .mlf.
    """
    from os import path
    sr = 10000000  # 100 ns sampling rate for .mlfs, i think this is fixed?
    with open(inputFile, 'r') as source:
        lines = source.readlines()
    gridList = []
    name, word, words, phones = '', (), [], []
    started = False                          # seen a file name line yet?
    for line in lines[1:]:                   # the first line is a header
        if line.startswith('"'):             # look for filename
            if started:                      # write out the previous file
                gridList.append((name, [words, phones]))
            started = True
            word, words, phones = (), [], []          # reset these
            item = path.split(line.lstrip('"').rstrip().rstrip('"'))[1]
            name = path.splitext(item)[0]             # get file name
            continue
        strings = line.split()
        if not strings:                      # blank line, nothing to do
            continue
        if len(strings) == 4:                # phone that starts a word
            start = float(strings[0]) / sr
            if len(word) == 2:               # close the word before it
                words.append((word[0], start, word[1]))
            word = (start, strings[3])       # save word for l8r
            phones.append((start, float(strings[1]) / sr, strings[2]))
        elif len(strings) == 3:              # phone only
            phones.append((float(strings[0]) / sr, float(strings[1]) / sr,
                           strings[2]))
        else:                                # period: close the last word
            if len(word) == 2 and phones:
                words.append((word[0], phones[-1][1], word[1]))
            word = ()                        # clean this up for later
    if started:                              # the last file has no successor
        gridList.append((name, [words, phones]))
    return gridList  # this is a list of grids which are lists and so on


def readPitchTier(inputFile):
    """
    Input: Praat PitchTier file name (long or short text format)
    Output: List of (time,F0) tuples

    Blank lines after the three header lines are ignored. Raises IndexError
    if the file ends in the middle of a point.
    """
    with open(inputFile, 'r') as source:
        lines = source.readlines()
    # drop file type, object class and blank, then any other blank lines
    body = [line for line in lines[3:] if line.strip()]
    f0s = []                                 # f0 best candidates
    points = body[3:]                        # past xmin, xmax, size
    if 'xmin' in body[0]:                    # long PitchTier if this is true
        for k in range(0, len(points), 3):   # 'points [n]', time, value
            time = float(points[k + 1].split(' = ')[1])
            f0s.append((time, float(points[k + 2].split(' = ')[1])))
    else:                                    # short PitchTier
        for k in range(0, len(points), 2):   # time, value
            f0s.append((float(points[k]), float(points[k + 1])))
    return f0s


# ARPABET functions
# Each function returns a new list of symbols, so callers may modify it.

def monothongs():
    """ Monophthong vowels """
    return ['IY', 'UW', 'IH', 'UH', 'EH', 'AH', 'AE', 'AO', 'AA']


def diphthongs():
    """ Diphthong vowels """
    return ['EY', 'OW', 'OY', 'AY', 'AW']


def rhoticVowels():
    """ R-colored vowels """
    # NOTE(review): 'URH' does not follow the vowel + 'R' pattern of its
    # neighbours and may be a typo for 'UHR' -- confirm before changing.
    return ['AOR', 'AAR', 'IHR', 'URH', 'EHR', 'ER']


def vowels():
    """ All vowels """
    return monothongs() + diphthongs() + rhoticVowels()


def vlStops():
    """ Voiceless stops """
    return ['P', 'T', 'K']


def vdStops():
    """ Voiced stops """
    return ['B', 'D', 'G']


def stops():
    """ All stops """
    return vlStops() + vdStops()


def affricates():
    """ Affricates """
    return ['JH', 'CH']


def vlFricatives():
    """ Voiceless fricatives """
    return ['SH', 'TH', 'S', 'F', 'HH']


def vdFricatives():
    """ Voiced fricatives """
    return ['DH', 'ZH', 'V', 'Z']


def fricatives():
    """ All fricatives """
    return vdFricatives() + vlFricatives()


def vlObstruents():
    """ Voiceless fricatives, stops and affricate """
    return vlFricatives() + vlStops() + ['CH']


def vdObstruents():
    """ Voiced fricatives, stops and affricate """
    return vdFricatives() + vdStops() + ['JH']


def obstruents():
    """ All obstruents """
    return vlObstruents() + vdObstruents()


def nasals():
    """ Nasals """
    return ['M', 'N', 'NG']


def approximants():
    """ Approximants """
    return ['R', 'Y', 'L', 'W']


def sonorants():
    """ Nasals and approximants """
    return nasals() + approximants()


def voiceless():
    """ All voiceless consonants """
    return vlFricatives() + vlStops() + ['CH']


def voiced():
    """ All voiced consonants """
    return vdFricatives() + vdStops() + ['JH'] + sonorants()


def consonants():
    """ All consonants """
    return voiceless() + voiced()


def arpabet():
    """ The whole symbol set: vowels followed by consonants """
    return vowels() + consonants()