import pandas as pd import re, glob, nltk, sys, os import nltk # Connective cuewords are defined in paper CUEWORDS_COMPARISON = set(['but', 'however', 'although', 'by contrast']) CUEWORDS_CONTINGENCY = set(['because', 'so', 'thus', 'as a result', 'consequently', 'therefore']) CUEWORDS_EXPANSION = set(['also', 'for example', 'in addition', 'instead', 'indeed', 'moreover', 'for instance', 'in fact', 'furthermore', 'or', 'and']) CUEWORDS_TEMPORAL = set(['then', 'previously', 'earlier', 'later', 'after', 'before']) # the following two list should match the sequence between each other CUEWORDS_ALL = [CUEWORDS_COMPARISON, CUEWORDS_CONTINGENCY, CUEWORDS_EXPANSION, CUEWORDS_TEMPORAL] RELATION_NAMES = ['comparison', 'contingency', 'expansion', 'temporal'] def identify_pairs(utt_text, utt_text_prev="", show=True): results = [] utt_text = utt_text.lower() # Divide the text to individual sentence, and deal with sentence separately text_lst = nltk.sent_tokenize(utt_text) # Load previous turn utterance, to provide info for argument in previous turn, if the connective appears at the beginning text_prev_lst = nltk.sent_tokenize(utt_text_prev) # Pick paris for each text in text list for text_count, text in enumerate(text_lst): text_pos = nltk.pos_tag(nltk.word_tokenize(text)) for relation_idx, cuewords_lst in enumerate(CUEWORDS_ALL): for cue_this in cuewords_lst: # find all appears of connectives iter = re.finditer(r"\b%s\b" % cue_this, text) indices = [m.start(0) for m in iter] if len(indices) > 0: # if do exist a connective of this kind of relation if show: print (text) print (cue_this) print (indices) for connective_index in indices: # if the connective is in the middle if connective_index == 0: if text_count > 0: arg1 = text_lst[text_count - 1] elif len(text_prev_lst) > 1: arg1 = text_prev_lst[-1] else: arg1 = '' arg2 = text type_this = 'begin' if show: print (type_this) else: # for each detected connectives arg1 = text[0:connective_index] arg2 = text[connective_index:] type_this = 'mid' if show: print (type_this) # Begin: Rules to filter out not common connectives cases # find the place of the connective select_flag = True # remove connectives arg2 = (arg2[len(cue_this):]).strip() try: # Filter 1: pos tag of the connective words if type_this == 'mid': connective_pos = text_pos[len(nltk.word_tokenize(arg1))][1] if show: print (text_pos[len(nltk.word_tokenize(arg1))]) else: connective_pos = text_pos[0][1] if show: print (text_pos[0]) if connective_pos not in ['IN', 'CC']: select_flag = False # Filter 2: length of arguments if len(nltk.word_tokenize(arg1)) < 3 or len(nltk.word_tokenize(arg2)) < 4: select_flag = False if cue_this == 'and' and (len(nltk.word_tokenize(arg1)) < 12 or len(nltk.word_tokenize(arg2)) < 12): select_flag = False if cue_this == 'and' and type_this == 'begin': select_flag = False if cue_this == 'or' and (len(nltk.word_tokenize(arg1)) < 12 or len(nltk.word_tokenize(arg2)) < 12): select_flag = False if cue_this == 'after' and (len(nltk.word_tokenize(arg1)) < 12 or len(nltk.word_tokenize(arg2)) < 12): select_flag = False if cue_this == 'before' and (len(nltk.word_tokenize(arg1)) < 6 or len(nltk.word_tokenize(arg2)) < 6): select_flag = False except: select_flag = False # End of rules if select_flag: # save to results dictionary list result = {} result['arg1'] = arg1 result['arg2'] = arg2 result['relation'] = RELATION_NAMES[relation_idx] result['connective'] = cue_this result['type'] = type_this result['original_utt'] = utt_text result['original_utt_prev'] = utt_text_prev results.append(result) if show: print ('->', RELATION_NAMES[relation_idx]) return results # Get pairs with cuewords def get_pairs(df, header, output, show=True): data = pd.DataFrame(columns=header) list_utt_name = 'Answer.sentence' index = 0 list_utt_range = 100 prev_text = "" for i in range(1, len(df.index)): # for i in range(1, 5): for range_i in range(1, list_utt_range): col_name_this = '%s%s' % (list_utt_name, str(range_i)) if col_name_this not in df.columns: # if there is not more utterances in this conversation, terminate break else: # for each utterance text = str(df.ix[i, col_name_this]) results = identify_pairs(text, prev_text, show=show) # update previous text prev_text = text # save result to csv row for result in results: for key, value in result.items(): data.loc[index, key] = value index += 1 data.to_csv(output, columns=header, index=False) if __name__ == '__main__': ''' Code for original edina corpus ''' header = ['arg1', 'arg2', 'relation', 'connective', 'type', 'original_utt', 'original_utt_prev'] for foldername in os.listdir('Edina/'): if os.path.isdir('Edina/%s' % foldername): print ('+++++ %s ...' % foldername) for filename in os.listdir('Edina/%s' % foldername): pathname = 'Edina/%s/%s' % (foldername, filename) df = pd.read_csv(pathname) output = 'Edina-DR2/Edina-DR/%s/pairs_%s' % (foldername, filename) if not os.path.exists('Edina-DR2/Edina-DR/%s' % foldername): os.makedirs('Edina-DR2/Edina-DR/%s' % foldername) get_pairs(df, header, output, show=False) ''' Code for debugging ''' # identify_pairs('I love you but do you love me?', show=True) # identify_pairs('I love elephant a lot but I go to zoo to see them', show=True) # identify_pairs('I love cute small elephant a lot and I go to zoo to see the love ones them', show=True) # identify_pairs('This is Tom and Jerry', show=True) # identify_pairs("no kidding, right, he's our best scorer. hopefully he'll be back before that game.")