"""Code /t,d/ cluster tokens from a tab-delimited transcript.

Usage::

    python SCRIPT TRANSCRIPT CMUDICT > coded.txt

TRANSCRIPT is read one line at a time.  Each line is split on tabs; column 3
and column 4 (1-based) are the start and end times of the breath group and
column 5 is the space-separated text of the breath group.  Any token of the
text that begins with a digit is treated as a coding string for the word
immediately before it: its first character is the dependent variable and
its second character the grammatical code.

CMUDICT is a CMU-style pronouncing dictionary.  Words that are missing from
it are requested interactively (prompt on stderr, answer on stdin).

One tab-separated row per coding string is written to stdout.

The script avoids Python-2-only syntax (``print`` statement, ``string.join``)
so the same source runs under Python 2.7 and Python 3.
"""
import sys
import re
import string  # kept from the original import list; no longer used below

HEADER = ("Word\tDepVar\tTD\tGram\tStress\tPreSeg\tFolSeg\tPreSegCode\t"
          "FolSegCode\tCodaSize\tInternal\tFollowing Word\tWordsPerSecond\t"
          "Chunk")

# Any phone containing one of these letters is treated as a syllable nucleus.
_NUCLEUS = re.compile("A|E|I|O|U")
_DIGIT = re.compile(r"\d")
_PUNCT = re.compile(r"\W")
_TD = re.compile("[TD]")

# Onset phonotactics.  All of these are used with ``match`` (prefix match),
# so e.g. "TH" satisfies a pattern that lists "T".
_BEFORE_L = re.compile("B|F|G|K|P|S|S")
_BEFORE_R = re.compile("B|D|F|G|K|P|T|V")
_BEFORE_W = re.compile("T|D|K|P|S")
_BEFORE_Y = re.compile("B|F|V|K|G")
_AFTER_S = re.compile("K|L|M|N|P|T|V|W")

# Coding of the segment that follows the /t,d/ ("q" is the pause code).
folsegcoding = {
    "AA": "vowel",
    "AE": "vowel",
    "AH": "vowel",
    "AO": "vowel",
    "AW": "vowel",
    "AY": "vowel",
    "B": "stop",
    "CH": "stop",
    "D": "stop",
    "DH": "stop",
    "EH": "vowel",
    "ER": "vowel",
    "EY": "vowel",
    "F": "fricative",
    "G": "stop",
    "HH": "h",
    "IH": "vowel",
    "IY": "vowel",
    "JH": "stop",
    "K": "stop",
    "L": "l",
    "M": "nasal",
    "N": "nasal",
    "NG": "nasal",
    "OW": "vowel",
    "OY": "vowel",
    "P": "stop",
    "R": "r",
    "S": "fricative",
    "SH": "fricative",
    "T": "stop",
    "TH": "stop",
    "UH": "vowel",
    "UW": "vowel",
    "V": "fricative",
    "W": "w",
    "Y": "y",
    "Z": "fricative",
    "ZH": "fricative",
    "q": "q",
}

# Coding of the segment that precedes the /t,d/.
presegcoding = {
    "B": "obstruent",
    "CH": "obstruent",
    "DH": "fricative",
    "F": "fricative",
    "G": "obstruent",
    "JH": "obstruent",
    "K": "obstruent",
    "L": "l",
    "M": "nasal",
    "N": "nasal",
    "NG": "nasal",
    "P": "obstruent",
    "S": "sibilant",
    "SH": "sibilant",
    "TH": "fricative",  # was misspelled "fricatuve", creating a stray level
    "V": "fricative",
    "Z": "sibilant",
    "ZH": "sibilant",
}


def load_cmu(path):
    """Read a CMU-style dictionary file into ``{WORD: [phone, ...]}``.

    Lines containing ``;;;`` are skipped as comments and blank lines are
    ignored.  Apostrophes are removed from the headword, and headwords
    containing ``(`` (alternate pronunciations) are skipped.

    The headword is separated from the pronunciation at the first space and
    the pronunciation is split on whitespace runs, so both one-space and
    two-space separated dictionaries are parsed correctly.
    """
    entries = {}
    with open(path) as cmuf:
        for raw in cmuf:
            line = raw.rstrip()
            if not line or ";;;" in line:
                continue
            head, _, pron = line.partition(" ")
            head = head.replace("'", "")
            phones = pron.split()
            if "(" in head or not phones:
                continue
            entries[head] = phones
    return entries


def _onset_accepts(onset, seg):
    """Return True if ``seg`` may be prepended to the onset built so far."""
    if not onset:
        return True
    head = onset[0]
    if head == "L":
        return bool(_BEFORE_L.match(seg))
    if head == "R":
        return bool(_BEFORE_R.match(seg))
    if head == "W":
        return bool(_BEFORE_W.match(seg))
    if head == "Y":
        return bool(_BEFORE_Y.match(seg))
    if _AFTER_S.match(head):
        return seg == "S"
    return False


def syllabify(w):
    """Split a list of CMU phones into syllables.

    Parameters:
        w: list of phone strings, e.g. ``["M", "AE2", "S", "T", "ER0"]``.
           The list is not modified.

    Returns:
        A list with one entry per syllable, each of the form
        ``[onset, [nucleus], coda, [stress]]`` where ``onset`` and ``coda``
        are lists of phones, ``nucleus`` has its stress digit removed
        ("AH0" is written "@"), and ``stress`` is the stress digit as a
        string ("" when the nucleus carries no digit).

    Procedure:
        * every phone matching A/E/I/O/U is a nucleus; a nucleus containing
          "R" (e.g. "ER0") gets an "R" phone inserted after it unless one
          already follows;
        * onsets are grown leftwards from each nucleus for as long as the
          cluster rules allow, never crossing the previous nucleus and never
          taking "NG";
        * whatever lies between a nucleus and the start of the next syllable
          (or the end of the word) is that syllable's coda.
    """
    # Work on a copy: callers pass cached dictionary entries, and editing
    # them in place would change the result the next time the word is seen.
    w = list(w)

    # Locate nuclei.  The length is re-read every iteration because the
    # "R" insertion below makes the list longer while it is being scanned.
    nucs = []
    i = 0
    while i < len(w):
        if _NUCLEUS.search(w[i]):
            nucs.append(i)
            if "R" in w[i] and (i + 1 == len(w) or w[i + 1] != "R"):
                w.insert(i + 1, "R")
        i += 1

    syls = []
    starts = []  # index of the first segment (onset or nucleus) per syllable
    for k, nuc in enumerate(nucs):
        digit = _DIGIT.search(w[nuc])
        stress = digit.group() if digit else ""
        vowel = "@" if w[nuc] == "AH0" else _DIGIT.sub("", w[nuc])

        floor = nucs[k - 1] if k else -1  # onset may not reach this index
        onset = []
        ons = nuc - 1
        while (ons > floor and w[ons] != "NG"
               and _onset_accepts(onset, w[ons])):
            onset.insert(0, w[ons])
            ons -= 1

        starts.append(ons + 1)
        syls.append([onset, [vowel], [], [stress]])

    # Codas: everything after the nucleus up to the next syllable's start.
    for k, nuc in enumerate(nucs):
        end = starts[k + 1] if k + 1 < len(nucs) else len(w)
        syls[k][2].extend(w[nuc + 1:end])

    return syls


def _is_internal(token):
    """Classify a coding token as word-internal (True), final (False) or "ERROR".

    The decision uses the token length after removing non-word characters:
    2 -> final; 3 -> internal if the raw token ends in "i", final if it ends
    in "n", otherwise "ERROR"; 4 -> internal; anything else -> "ERROR".
    """
    size = len(_PUNCT.sub("", token))
    if size == 2:
        return False
    if size == 3:
        if token[-1] == "i":
            return True
        if token[-1] == "n":
            return False
        return "ERROR"
    if size == 4:
        return True
    return "ERROR"


def _following_word(chunk, i):
    """Return the word after coding token ``i``, or "{Q}" before a pause.

    A pause is assumed at the end of the breath group, or when either the
    coding token or the coded word ends in a period.
    """
    if (i + 1 == len(chunk) or chunk[i][-1] == "."
            or chunk[i - 1][-1] == "."):
        return "{Q}"
    return _PUNCT.sub("", chunk[i + 1]).upper()


def _pronunciation(word, cmu):
    """Look ``word`` up in ``cmu``, asking the user when it is missing.

    A transcription typed by the user is stored in ``cmu`` so the same word
    is only requested once.
    """
    if word not in cmu:
        sys.stderr.write("Please transcribe " + word + "\n")
        cmu[word] = sys.stdin.readline().rstrip().split(" ")
    return cmu[word]


def _code_line(line, cmu):
    """Write one output row for every coding token on a transcript line."""
    fields = line.split("\t")
    chunk = fields[4].split(" ")  # change this if the file has other columns

    # Words per second over the breath group; coding tokens are not words.
    codings = sum(1 for token in chunk if _DIGIT.match(token))
    dur = float(fields[3]) - float(fields[2])
    wps = round((len(chunk) - codings) / dur, 2)

    for i, token in enumerate(chunk):
        if not _DIGIT.match(token):
            continue

        internal = _is_internal(token)
        folword = _following_word(chunk, i)
        word = _PUNCT.sub("", chunk[i - 1]).upper()
        depvar = token[0]
        gram = token[1]
        syls = syllabify(_pronunciation(word, cmu))

        stress = preseg = folseg = codsize = seg = ""
        if internal:  # note: "ERROR" is truthy and is handled as internal
            # Last non-final syllable whose coda is a cluster ending in T/D.
            for j in range(len(syls) - 1):
                coda = syls[j][2]
                if (len(coda) > 1 and len(coda[-1]) == 1
                        and _TD.match(coda[-1])):
                    seg = coda[-1]
                    stress = syls[j][3][0]
                    preseg = coda[-2]
                    codsize = len(coda)
                    nxt = syls[j + 1]
                    first = nxt[0][0] if nxt[0] else nxt[1][0]
                    folseg = _DIGIT.sub("", first)
        else:
            coda = syls[-1][2]
            seg = coda[-1]
            stress = syls[-1][3][0]
            preseg = coda[-2]
            codsize = len(coda)
            if folword == "{Q}":
                folseg = "q"
            elif folword not in cmu:
                sys.stderr.write("Please Provide First Segment of " + folword)
                folseg = sys.stdin.readline().rstrip()
            else:
                folseg = _DIGIT.sub("", cmu[folword][0])

        fol_key = _DIGIT.sub("", folseg)
        if fol_key in folsegcoding:
            folcode = folsegcoding[fol_key]
        else:
            sys.stderr.write("ERROR: Following Segment not Recognized\n")
            folcode = "other"

        pre_key = _DIGIT.sub("", preseg)
        if pre_key in presegcoding:
            precode = presegcoding[pre_key]
        else:
            sys.stderr.write("ERROR: Preceding Segment not Recognized\n")
            precode = "other"

        row = [word, depvar, seg, gram, stress, preseg, folseg, precode,
               folcode, str(codsize), str(internal), folword, str(wps),
               " ".join(chunk)]
        sys.stdout.write("\t".join(row) + "\n")


def main(argv=None):
    """Run the coder: ``argv[1]`` is the transcript, ``argv[2]`` the dictionary."""
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        sys.stderr.write("usage: %s TRANSCRIPT CMUDICT\n" % argv[0])
        return 2

    with open(argv[1]) as fi:
        cmu = load_cmu(argv[2])
        sys.stdout.write(HEADER + "\n")
        for raw in fi:
            line = raw.rstrip()
            if not line:
                continue  # a blank line is not the end of the file
            _code_line(line, cmu)
    return 0


if __name__ == "__main__":
    sys.exit(main())