#!/usr/bin/env python # Based on grs2depbank by James Curran # # Last updated 2 Sept 2008 LR import sys CONVERT = {'-LRB-': '(', '-RRB-': ')', '-LCB-': '{', '-RCB-': '}', '-LSB-': '[', '-RSB-': ']'} def punct_convert(s): for (old, new) in CONVERT.iteritems(): s = s.replace(old, new) return s def is_subset(list1,list2): subset = True for item in list1: if item not in list2: subset = False return subset PUNCT = set("( ) { } [ ] -- : ; , . ? !".split()) class Word: def __init__(self, token, index): self.index = index fields = token.split('|') self.token = fields[0] self.pos = fields[1] self.cat = fields[2] self.grs = [] self.sub = None def __cmp__(self, other): return cmp(self.index, other.index) def is_num(self): return self.pos == 'CD' def __repr__(self): if self.sub: return repr(self.sub) else: return '%s_%d' % (self.token, self.index) def convert_arg(arg, gr, words): if '_' in arg and arg != '_': word = words[arg] word.grs.append(gr) return word else: return arg def nopos(word): if type(word) == type(''): return word else: return word.token class GR: def __init__(self, line, words): self.line = line fields = line[1:-1].split() self.label = fields[0] self.args = [convert_arg(arg, self, words) for arg in fields[1:]] self.ignore = False def __repr__(self): return '(%s %s)' % (self.label, ' '.join(map(str, self.args))) def nopos(self): return '(%s %s)' % (self.label, ' '.join(map(nopos, self.args))) def replace(self, old, new): for i in xrange(len(self.args)): if self.args[i] == old: self.args[i] = new class Sentence: def __init__(self, grs, line): self.words = {} self.sentence = [] for (i, token) in enumerate(line.split()): word = Word(token, i) self.sentence.append(word) self.words[str(word)] = word self.grs = [GR(gr, self.words) for gr in grs] self.tokens = {} self.pos = {} self.cats = {} for w in self.sentence: self.tokens.setdefault(w.token, []).append(w) self.pos.setdefault(w.pos, []).append(w) self.cats.setdefault(w.cat, []).append(w) # NB subroutines from grs2depbank that aren't (currently) being used # are commented out def postprocess(self): if len(self.grs) == 0: return # some of these routines add GRs and we don't # want to do that if it was a parse failure self.rcmod_fix() # fixes a markedup bug self.question_aux_fix() # fixes a markedup bug self.det_to_poss() self.preconj() self.ecm_to_nsubj() self.add_funny_conj() self.fix_aux_conj() self.merge_conj_args() self.conj_to_appos() # self.remove_comma_conj() # self.add_passives() # self.ncmod_to_iobj() # self.fix_ncmod_num() self.num_and_number() # self.fix_question_poss() Don't need anymore, fixed in markedup self.in_vivo_vitro() self.switch_n_and_num() self.less_than() self.prep_to_appos() self.nn_and_amod() # REMOVE FOR BIOINFER # self.sfs_to_conj() # self.fix_ampersands() # self.remove_relpro_cmod() # self.cmod_to_ta() # self.ncmod_to_prt() # self.xmod_add_to() # self.fix_that_relpro() self.add_rel() # self.xmod_to_ta() # self.pct_ncmod_to_dobj() self.partmod_to_xcomp() self.only_dep() self.filter_punct() def rcmod_fix(self): """ fixes a bug in interpretation of markedup where a particular argument isn't output. Flagged with a special "rcmodfix" dependency label in markedup. """ grs = [gr for gr in self.grs if gr.label == 'rcmodfix'] for gr in grs: for gr2 in self.grs: if gr2.label == 'ref' and gr2.args[1] == gr.args[0]: gr.args[0] = gr2.args[0] gr.label = 'rcmod' def add_passives(self): for w in self.sentence: if w.cat.lstrip('(').startswith('S[pss]\NP') or \ (w.pos == 'VBN' and w.cat == 'N/N'): self.grs.append(GR('(passive %s)' % w, self.words)) def det_to_poss(self): """ converts det to poss when the token is a possessive """ poss = ['its', 'our', 'his', 'her', 'my', 'their', 'your'] for gr in self.grs: if gr.label == 'det' and gr.args[1].token in poss: gr.label = 'poss' def nn_and_amod(self): """ converts nmod to nn when pos=NN* and to amod when pos is anything else """ for gr in self.grs: if gr.label == 'nmod': if gr.args[1].pos.startswith('NN'): gr.label = 'nn' else: gr.label = 'amod' def fix_question_poss(self): """ fixes possessives in questions e.g. "what fruit's stone". they come out of the parser with "what" as an argument of the possessive rather than "fruit". specifically, changes: poss x y where y has pos=WDT to poss x z and possessive y w to possessive z w where in both cases z is the element such that there is a relation (det z y) """ for gr in self.grs: if gr.label == 'poss': if gr.args[1].pos == 'WDT': for gr2 in self.grs: if gr2.label == 'det' and gr2.args[1] == gr.args[1]: gr.args[1] = gr2.args[0] if gr.label == 'possessive': if gr.args[0].pos == 'WDT': for gr2 in self.grs: if gr2.label == 'det' and gr2.args[1] == gr.args[0]: gr.args[0] = gr2.args[0] def question_aux_fix(self): """ fixes a bug in interpretation of the markedup file, in which an aux is being produced in some relations where the main verb should actually be produced. Flagged with a special "nsubjfix" dependency label in markedup. """ grs = [gr for gr in self.grs if gr.label == 'nsubjfix'] for gr in grs: for gr2 in self.grs: if (gr2.label == 'aux' or gr2.label == 'cop') and gr2.args[1] == gr.args[0]: gr.args[0] = gr2.args[0] gr.label = 'nsubj' def add_funny_conj(self): for w in self.sentence: if w.cat == 'conj' and w.grs == []: prev = self.sentence[w.index - 1] gr = GR('(conj %s %s)' % (w, prev), self.words) self.grs.append(gr) # w.grs.append(gr) next = self.sentence[w.index + 1] gr = GR('(conj %s %s)' % (w, next), self.words) self.grs.append(gr) # w.grs.append(gr) def merge_conj_args(self): """ converts the rich parser output to more compact representation required by sd """ cc = {} # key = conjunction, value = list of its conjuncts # Add where the conjunction is a comma first for w in self.sentence: if w.token == ',': conj = [gr for gr in w.grs if gr.label == 'conj'] for gr in conj: cc.setdefault(str(w), []).append(gr.args[1]) gr.ignore = True # Remove subsets that happen with serial commas # e.g. for "a, b, c, and d" the parser will output conjunctions # consisting of c+d, b+c+d, a+b+c+d; we want to delete the # subsets and keep the superset for w,l in cc.iteritems(): for x,m in cc.iteritems(): if w != x and is_subset(l,m): cc[w].append('DELETE') # can't delete a dictionary entry while iterating through the dictionary - there should be cleaner ways to do this for w in cc.keys(): if 'DELETE' in cc[w]: del cc[w] # Add other conjunctions (conjunction is not a comma) # This is either the end of a list (x, y, and z / x, y and z), # a plain binary conjunction, or something like "is consistent # with, but does not prove". # At the end of a list, the subset check ensures that the conjunction # word gets used, but the original list is retained. # # Note that the parser already handles "x, y and z" by having the # comma conjoin all three items, otherwise this routine would # not work. addtocc = {} for w in self.sentence: if w.cat == 'conj': conj = [gr for gr in w.grs if gr.label == 'conj'] for gr in conj: gr.ignore = True conjuncts = [gr.args[1] for gr in w.grs if gr.label == 'conj'] if conjuncts: subset = False for x,l in cc.iteritems(): if is_subset(conjuncts,l): subset = True if 'DELETE' not in l: # if DELETE is there, the full list of conjuncts has already been added to addtocc with some conjunction word and we won't add another one (this usually happens when the parser has done something wrong with the coord anyway) (if we don't take this step, the list with DELETE in it gets into addtocc) (again there will be a cleaner way to do this) addtocc.setdefault(str(w), l[:]) cc[x].append('DELETE') if not subset: addtocc.setdefault(str(w), conjuncts) for w in cc.keys(): if 'DELETE' in cc[w]: del cc[w] for w,l in addtocc.iteritems(): cc.setdefault(w,l) # Sort all the lists so we can get the new grs right for w,l in cc.iteritems(): l.sort() # Add the new grs for w in cc.keys(): first_conjunct = cc[w][0] for c in cc[w][1:]: self.grs.append(GR('(conj %s %s)' % (first_conjunct, c), self.words)) # Add the cc newgr = GR('(cc %s %s)' % (first_conjunct, w), self.words) self.grs.append(newgr) # Finally, normalize any other grs in which all members of a # coordination participate - keep only the gr with the first conjunct grs_left = {} grs_right = {} for gr in self.grs: keyleft = str(gr.label)+" "+str(gr.args[0]) grs_left.setdefault(keyleft, []).append(gr.args[1]) keyright = str(gr.label)+" "+str(gr.args[1]) grs_right.setdefault(keyright, []).append(gr.args[0]) for k,l in grs_left.iteritems(): for w,c in cc.iteritems(): if k != w and is_subset(c,l): c.sort() labelparts = k.split() for item in c[1:]: testgr = GR('(%s %s %s)' % (labelparts[0], labelparts[1], item), self.words) for gr in self.grs: if gr.label == testgr.label and gr.args[0] == testgr.args[0] and gr.args[1] == testgr.args[1]: gr.ignore = True for k,l in grs_right.iteritems(): equal = False for w,c in cc.iteritems(): if k!= w and is_subset(c,l): c.sort labelparts = k.split() for item in c[1:]: testgr = GR('(%s %s %s)' % (labelparts[0], item, labelparts[1]), self.words) for gr in self.grs: if gr.label == testgr.label and gr.args[0] == testgr.args[0] and gr.args[1] == testgr.args[1]: gr.ignore = True def conj_to_appos(self): """ converts the pair of dependencies conj x y cc x , to the dependency: appos x y For example "x, a y, was found in ..." Takes advantage of the fact that the parser considers this a conjunction, and the merge_conj_args routine leaves it in this state because it never finds a conjunction word. Essentially this type of appos is like a conjunction without a conj word, so the comma is left hanging as a cc. Also takes advantage of the fact that the previous routine has already normalized GRs in which these things appear, so only the first element appears, which would have needed to be done for appositives anyway NB must be run AFTER merge_conj_args """ for gr in self.grs: if gr.label == 'cc' and gr.args[1].token == ',': gr.ignore = True for gr2 in self.grs: if gr2.label == 'conj' and gr2.args[0] == gr.args[0]: # could go wrong if there is something like "x, a y, and z" where there'd also be a (conj x z) relation that would get changed to appos. would require some fairly complicated checking but could be done. gr2.label = 'appos' def remove_comma_conj(self): for w in self.sentence: if w.token == ',' and w.grs: conj = [gr for gr in w.grs if gr.label == 'conj'] for gr in conj: gr.ignore = True if len(conj) == 2: head, mod = sorted([conj[0].args[1], conj[1].args[1]]) mod.sub = head head_grs = set([str(gr) for gr in head.grs]) for gr in mod.grs: if str(gr) in head_grs: gr.ignore = True mod.sub = None # self.grs.append(GR('(ncmod _ %s %s)' % (head, mod), self.words)) def num_and_number(self): """ converts nmod, amod, advmod, dep to dependency type num when the modifier is a number, i.e. has pos=CD or to dependency type number when both args are numbers """ deptypes = ['nmod', 'advmod', 'amod', 'dep'] for gr in self.grs: if gr.label in deptypes and gr.args[1].pos == 'CD': if gr.args[0].pos == 'CD': gr.label = 'number' gr.args[0], gr.args[1] = gr.args[1], gr.args[0] else: gr.label = 'num' def in_vivo_vitro(self): """ converts pobj in vivo/vitro to dep in vivo/vitro to match BioInfer. """ words = ['vivo', 'vitro'] for gr in self.grs: if gr.args[1].token in words and gr.args[0].token == 'in' and gr.label == 'pobj': gr.label = 'dep' def only_dep(self): """ converts advmod x only where only cat = ((S\NP)\(S\NP))/((S\NP)\(S\NP)) to dep x only to match BioInfer """ for gr in self.grs: if gr.label == 'advmod' and gr.args[1].token.lower() == 'only' and gr.args[1].cat == '((S\NP)\(S\NP))/((S\NP)\(S\NP))': gr.label = 'dep' def prep_to_appos(self): """ converts the pair of dependencies: pobj -LRB- x prep y -LRB- to the dependency: appox y x because parentheses are done as appos in BioInfer/SD. Should be possible to handle in markedup file, but not working at the moment. Also handles the case: nsubj y -LRB- rcmod x z to the dep: appos x y which comes from the category (NP\NP)/(S[dcl]\NP) just in case this is actually a parenthesis and not a relative pronoun. Also should be possible to handle in markedup file. This isn't perfect b/c if there are two rcmods it's indeterminate which one it will get - need to do some checking on the index. """ for gr in self.grs: if gr.label == 'prep' and ('-LRB-' in str(gr.args[1]) or '(' in str(gr.args[1])) and not gr.ignore: for gr2 in self.grs: if gr2.label == 'pobj' and gr.args[1] == gr2.args[0] and not gr2.ignore: gr.ignore = True self.grs.append(GR('(appos %s %s)' % (gr.args[0], gr2.args[1]), self.words)) gr2.ignore = True # note there may be more than one of these if another subroutine has modified the list - some may be ignore - just let it loop through all of them and not worry about it # The rcmod code isn't fully working for gr in self.grs: if gr.label == 'nsubj' and '-LRB-' in str(gr.args[1]): for gr2 in self.grs: if gr2.label == 'rcmod': gr.ignore = True gr2.ignore = True newgr = GR('(appos %s %s)' % (gr2.args[1],gr.args[0]), self.words) self.grs.append(newgr) break def switch_n_and_num(self): """ in our corpus if there is something like "residues 260-281" we may have residues|N/N 260-281|N, i.e. we may have the last item in the NP as the head, even if it is a number. however BioInfer has the number modifying the head as a num dependency. This switches those. If it finds: nmod y|CD x then it changes that to: num x y and anything of the form: reltype y z => reltype x z reltype z y => reltype z x NB needs to be run before prep_to_appos and nn_and_amond """ nums = [w for w in self.sentence if w.pos == 'CD'] for w in nums: prev = self.sentence[w.index - 1] if prev.cat == 'N/N' and w.cat == 'N': for gr in w.grs: if gr.label == 'nmod' and gr.args[0] == w and gr.args[1] == prev: # nmod y x => num x y gr.label = 'num' gr.args[0], gr.args[1] = gr.args[1], gr.args[0] elif gr.args[0] == w and gr.args[1] != prev: # reltype y z => reltype x z gr.ignore = True self.grs.append(GR('(%s %s %s)' % (gr.label, prev, gr.args[1]), self.words)) elif gr.args[1] == w and gr.args[0] != prev: # reltype z y => reltype z x gr.ignore = True self.grs.append(GR('(%s %s %s)' % (gr.label, gr.args[0], prev), self.words)) def fix_aux_conj(self): """ markedup_sd is set up so that aux+adj, aux+noun, aux+bare, aux+pass i.e. (s[dcl]\NP)/(S[adj]\NP) and (S[dcl]\NP)/NP and (S[dcl]\NP)/(S[b]\NP) and (S[dcl]\NP)/(S[pss]\NP) where the S[dcl] is an auxiliary, will give the adj/noun/bare as a head. However, in case those items are conjoined this doesn't work - the conjunction will still come out with the aux. This function fixes that. It also switches the head for some other relation types where we know it should be switched. (Not sure whether this should be everything but cop and aux, but playing it safe, just doing the ones we saw in dev data.) It relies on the raw parser output for conjunctions, so it must be run BEFORE merge_conj_args(). """ aux = ['ai', 'am', 'are', 'be', 'been', 'being', 'is', 'was', 'were', "'s", "'m", "'re", 'has', 'have', 'had', "'ve", 'do', 'did', 'does', "'d", "'ll", 'ca', 'can', 'could', 'may', 'might', 'must', 'ought', 'shall', 'should', 'will', 'wo', 'would', 'get', 'got', 'gotten', 'getting'] auxcats = ['(S[dcl]\NP)/(S[adj]\NP)', '(S[dcl]\NP)/NP', '(S[dcl]\NP)/(S[b]\NP)', '(S[dcl]\NP)/(S[pss]\NP)'] reltypes = ['ccomp', 'complm', 'rel', 'rcmod', 'advmod', 'partmod'] for gr in self.grs: # This is before we do merge_conj_args(), so the conjunction will be # args[0] and the aux will be args[1] if gr.label == 'conj' and gr.args[1].token in aux and gr.args[1].cat in auxcats: # Find the word that we need to switch in for the aux - # it will be the one which has this aux as its copula word = None for gr2 in self.grs: if (gr2.label == 'cop' or gr2.label == 'aux') and gr2.args[1] == gr.args[1]: word = gr2.args[0] if word: # change the other relations besides conj first for gr3 in self.grs: # could be on either side of the relation if gr3.label in reltypes and gr3.args[0] == gr.args[1]: gr3.args[0] = word if gr3.label in reltypes and gr3.args[1] == gr.args[1]: gr3.args[1] = word # now change the original conj relation gr.args[1] = word def partmod_to_xcomp(self): """ in cases like: suggested to act, data is presented to suggest, suggested to be involved, suggested to be a common region, proposed to act, predicted to encode, ... which always has the category sequence S[pss]\NP (S[to]\NP)/(S[b]\NP) the parser uses a unary rule and produces partmod x y This needs to be changed to xcomp x z where z is the complement of "to", which can be found because there is a relation (aux z to). """ for gr in self.grs: if gr.label == 'partmod' and gr.args[0].cat == 'S[pss]\NP' and gr.args[1].cat == '(S[to]\NP)/(S[b]\NP)': # Find the replacement word. # It will be the one which has this "to" as its aux. word = None for gr2 in self.grs: if gr2.label == 'aux' and gr2.args[1] == gr.args[1]: word = gr2.args[0] if word: gr.label = 'xcomp' gr.args[1] = word def ecm_to_nsubj(self): """ for ECM (and control) constructions like this: these|DT|NP[nb]/N reactions|NNS|N may|MD|(S[dcl]\NP)/(S[b]\NP) allow|VB|((S[b]\NP)/(S[to]\NP))/NP profilin|NN|N to|TO|(S[to]\NP)/(S[b]\NP) have|VB|(S[b]\NP)/NP These come out with (xsubj have profilin) and it should be nsubj. We deliberately left (dobj allow profilin) in the markedup file, even though Stanford doesn't want that, to have a hook to fix this construction with. So, this subroutine changes the pair: (dobj x y) where cat x ends in /NP (xsubj z y) to (nsubj z y) """ verbs = [w for w in self.sentence if w.cat.endswith('/NP')] for v in verbs: grs = [gr for gr in v.grs if gr.args[0] == v and gr.label == 'dobj'] for gr in grs: for gr2 in self.grs: if gr2.label == 'xsubj' and gr2.args[1] == gr.args[1]: gr.ignore = True gr2.label = 'nsubj' def preconj(self): """ converts advmod x both / advmod x either to preconj x both / preconj x either """ preconjwords = ['both', 'either'] for gr in self.grs: if gr.label == 'advmod' and gr.args[1].token.lower() in preconjwords: gr.label = 'preconj' def less_than(self): """ handles the less than construction in BioInfer. specifically if the sentence contains x < n (where n has pos = CD) then it changes the pair of dependencies: num < n nmod < x to the pair: num x n dep x < to match BioInfer. Then, since BioInfer considers the head of this construction to be x, and our script has made it <, it changes < in any other relation to x. NB must be run after switch_n_and_num() """ lts = [w for w in self.sentence if w.token == '<' and w.cat == 'N/N'] for w in lts: prev = self.sentence[w.index - 1] next = self.sentence[w.index + 1] if prev.cat == 'N/N' and next.pos == 'CD' and next.cat == 'N': for gr in w.grs: if gr.label == 'num' and gr.args[0] == w and gr.args[1] == next: gr.args[0] = prev elif gr.label == 'nmod' and gr.args[0] == w and gr.args[1] == prev: gr.label = 'dep' gr.args[0], gr.args[1] = gr.args[1], gr.args[0] elif gr.args[0] == w and gr.args[1] != prev: # reltype < z => reltype x z gr.ignore = True self.grs.append(GR('(%s %s %s)' % (gr.label, prev, gr.args[1]), self.words)) elif gr.args[1] == w and gr.args[0] != prev: # reltype z < => reltype z x gr.ignore = True self.grs.append(GR('(%s %s %s)' % (gr.label, gr.args[0], prev), self.words)) def fix_ncmod_num(self): for w in self.cats.get('N[num]', []): for gr in w.grs: if gr.label != 'ncmod': continue if gr.args[1] == w: gr.args[0] = 'num' gr.args[1], gr.args[2] = gr.args[2], gr.args[1] elif gr.args[2] == w: gr.args[0] = 'num' gr.args[2] = self.sentence[gr.args[1].index + 1] for gr in self.grs: if gr.label == 'ncmod' and \ (gr.args[1].is_num() or gr.args[2].is_num()): gr.args[0] = 'num' PART = set("several some most all any more less many".split()) def ncmod_to_iobj(self): for gr in self.grs: if gr.label == 'ncmod' and gr.args[2].token == 'of': if gr.args[1].token.lower() not in self.PART: gr.args.pop(0) gr.label = 'iobj' def sfs_to_conj(self): for gr in self.grs: if gr.label == 'ncmod' and gr.args[2].pos == 'CC' and gr.args[2].cat == 'S/S': self.grs.append(GR('(conj %s %s)' % (gr.args[2], gr.args[1]), self.words)) gr.ignore = True def pct_ncmod_to_dobj(self): for gr in self.grs: if gr.label == 'ncmod' and gr.args[0] == '_' and gr.args[2].token == '%': gr.args.pop(0) gr.label = 'dobj' def xmod_to_ta(self): for gr in self.grs: if gr.label == 'xmod': start, end = sorted([gr.args[1].index, gr.args[2].index]) for i in xrange(start + 1, end): if self.sentence[i].token == ',': gr.label = 'ta' for i in xrange(end + 1, len(self.sentence)): if self.sentence[i].token == ',': gr.args[0] = 'bal' break else: gr.args[0] = 'end' break def xmod_add_to(self): for gr in self.grs: if gr.label == 'xmod': start, end = sorted([gr.args[1].index, gr.args[2].index]) for i in xrange(start + 1, end): if self.sentence[i].token == 'to': gr.args[0] = 'to' break SAY = set("say said says".split()) def cmod_to_ta(self): for gr in self.grs: if gr.label == 'cmod': if gr.args[2].token.lower() in self.SAY: gr.label = 'ta' gr.args[0] = 'quote' gr.args[1], gr.args[2] = gr.args[2], gr.args[1] elif gr.args[1].token.lower() in self.SAY: gr.label = 'ta' gr.args[0] = 'quote' def ncmod_to_prt(self): for gr in self.grs: if gr.label == 'ncmod': if gr.args[1].pos.startswith('V') and gr.args[2].pos == 'RP': gr.args[0] = 'prt' def filter_punct(self): for gr in self.grs: for arg in gr.args: if isinstance(arg, Word) and arg.token in PUNCT: gr.ignore = True break def fix_ampersands(self): for w in self.sentence: if w.token == '&' and w.cat == 'N/N': prev = self.sentence[w.index - 1] next = self.sentence[w.index + 1] for gr in w.grs: gr.ignore = True for gr in next.grs: if prev in gr.args: gr.ignore = True else: gr.replace(next, w) self.grs.append(GR('(conj %s %s)' % (w, prev), self.words)) self.grs.append(GR('(conj %s %s)' % (w, next), self.words)) def remove_relpro_cmod(self): for gr in self.grs: if gr.label == 'cmod' and gr.args[0] != '_' and gr.args[0].token != 'that': gr.args[0] = '_' def fix_that_relpro(self): for gr in self.grs: if gr.label == "cmod" and gr.args[0] != '_' and gr.args[0].token == 'that': that = gr.args[0] for gr2 in that.grs: if gr2.label == "ncsubj" and gr2.args[1] == that: gr2.args[1] = gr.args[1] def add_rel(self): """ in case of these two deps: rcmod x y nsubj(pass) y that/which adds an ADDITIONAL dep: rel y that/which In other words, adds a dependency to mark the relative pronoun as being rel. Note that in BioInfer there are BOTH the nsubj and the rel dependencies. Haven't checked whether this is the case for Stanford parser in general. also adds a second ADDITIONAL dep: ref x that/which this can't be done in the markedup file because would be one-to-many for (NP\NP)/(S[dcl]\NP) """ for gr in self.grs: if gr.label == 'rcmod': verb = gr.args[1] for gr2 in verb.grs: if (gr2.label == 'nsubj' or gr2.label == 'nsubjpass') and gr2.args[0] == verb and (gr2.args[1].token == 'that' or gr2.args[1].token == 'which'): self.grs.append(GR('(rel %s %s)' % (gr2.args[0], gr2.args[1]), self.words)) self.grs.append(GR('(ref %s %s)' % (gr.args[0], gr2.args[1]), self.words)) def read(filename, OUTPUT): grs = [] for line in open(filename): line = line.strip() if not line or line.startswith('#'): if OUTPUT == '--ccgbank': print line continue # line = punct_convert(line) if line.startswith('('): grs.append(line) elif line.startswith(' '): yield Sentence(grs, line[4:]) grs = [] OUTPUT = sys.argv[1] if sys.argv[2].startswith('--'): if sys.argv[2] != '--no-postprocess': print >> sys.stderr, "unrecognised command line argument %s" % sys.argv[2] sys.exit(1) POSTPROCESS = False FILENAME = sys.argv[3] else: POSTPROCESS = True FILENAME = sys.argv[2] if OUTPUT == "--ccgbank": print "# generated by grs2sd" elif OUTPUT == '--rasp-parse': print "%LB (" print "%RB )" for i, s in enumerate(read(FILENAME, OUTPUT)): if POSTPROCESS: s.postprocess() if OUTPUT == '--ccgbank': for gr in s.grs: if not gr.ignore: print gr print '', for w in s.sentence: print '%s|%s|%s' % (w.token, w.pos, w.cat), print elif OUTPUT == '--rasp-text': print print i + 1 for w in s.sentence: print w.token, print elif OUTPUT == '--rasp-parse': print print i + 1 print for gr in s.grs: if not gr.ignore: print gr.nopos() elif OUTPUT == '--text': for w in s.sentence: print w.token, print elif OUTPUT == '--pos': for w in s.sentence: print '%s|%s' % (w.token, w.pos), print