Some uses of WordNet
2019-05-07
VanJordan
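The script below uses NLTK's WordNet interface to do a simple data augmentation pass over the rt-polarity sentiment files: every sentence is kept as-is, and a few extra copies are written in which one randomly chosen word is replaced by a hyponym, hypernym, or synonym found in WordNet and filtered against the training vocabulary. As a minimal sketch of the WordNet calls the script relies on (the word 'dog' is only an illustration):

from nltk.corpus import wordnet as wn

syns = wn.synsets('dog')   # every synset that contains the word
syns[0].lemma_names()      # synonyms within one synset, e.g. ['dog', 'domestic_dog', 'Canis_familiaris']
syns[0].hyponyms()         # more specific synsets ("down" in the script)
syns[0].hypernyms()        # more general synsets ("up" in the script)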
from nltk.corpus import wordnet as wn
import argparse
import random
import re

parser = argparse.ArgumentParser()
parser.add_argument('--negative_num', type=int, default=3, help='number of augmented copies to generate per sentence')
parser.add_argument('--level', type=str, default='down', help='down, up, same or random: replace with hyponyms, hypernyms, synonyms, or a random choice of the three')
parser.add_argument('--every', type=int, default=1, help='how many words to replace in every sentence')
parser.add_argument('--seed', type=int, default=1234, help='random number seed')
args = parser.parse_args()

# vocab_path = './data/vocab.txt'
# filename_list = ['./data/rt-polarity.neg', './data/rt-polarity.pos']
vocab_path = 'D:\\exchange\\adversarial_text\\src\\data\\vocab.txt'
filename_list = ["D:\\exchange\\adversarial_text\\src\\data\\rt-polarity.neg",
                 "D:\\exchange\\adversarial_text\\src\\data\\rt-polarity.pos"]
replace_num = 0
all_num = 0
word2id = {}
random.seed(args.seed)

def print_log(file):
    # returns a logger that prints to stdout and appends the same message to `file`
    def write_log(s):
        print(s)
        with open(file, 'a') as f:
            f.write(str(s) + '\n')
    return write_log

print_log = print_log("D:\\exchange\\adversarial_text\\src\\data\\log.log")

# build the vocabulary of the original training data
with open(vocab_path, 'r', encoding='iso8859-1') as f:
    for idx, word in enumerate(f.readlines()):
        word2id[word.strip()] = idx
print_log('word2id:' + str(len(word2id)))
def get_candidate(word=None, level='down'):
    """Collect replacement candidates for `word` from WordNet."""
    word_candidate = []
    if level == 'down':
        for syn in wn.synsets(word):      # for every synset, collect the lemmas of its hyponyms
            for hyp in syn.hyponyms():
                for lemma in hyp.lemma_names():
                    word_candidate.append(lemma)
    elif level == 'up':
        for syn in wn.synsets(word):      # for every synset, collect the lemmas of its hypernyms
            for hyp in syn.hypernyms():
                for lemma in hyp.lemma_names():
                    word_candidate.append(lemma)
    elif level == 'same':
        for syn in wn.synsets(word):      # for every synset, collect its own lemmas (synonyms)
            for lemma in syn.lemma_names():
                word_candidate.append(lemma)
    else:
        raise ValueError('please check the argument "level": it must be one of down, up, same or random')
    random.shuffle(word_candidate)
    return word_candidate
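# Illustrative call, not part of the original script: for a common noun such as 'dog',
# get_candidate('dog', level='same') returns a shuffled list that may include lemmas like
# 'domestic_dog' and 'Canis_familiaris', while level='up' would instead draw lemmas from
# hypernym synsets such as canine.n.02.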
for filename in filename_list:
    out_name = (filename.split('.')[0] + '_' + str(args.negative_num) + '_' + str(args.every)
                + '_' + args.level + '.' + filename.split('.')[-1])
    with open(filename, 'r', encoding='iso8859-1') as fr, \
            open(out_name, 'w', encoding='iso8859-1') as fw:
        for line in fr:
            all_num += 1
            line = line.strip()
            fw.write(line + '\n')                   # keep the original sentence
            replace_word_list = []
            for _ in range(args.negative_num):      # write negative_num augmented copies
                word_list = line.split()
                flags = 0
                if flags < args.every:
                    for _ in range(len(word_list)):
                        choice_idx = random.choice(range(len(word_list)))  # randomly pick a word to replace
                        word = word_list[choice_idx]
                        if wn.synsets(word):        # stop once the picked word is covered by WordNet
                            break
                    if args.level == 'random':
                        word_candidate = get_candidate(word, random.choice(['same', 'down', 'up']))
                    else:
                        word_candidate = get_candidate(word, args.level)
                    for word_replace in word_candidate:
                        if word_replace not in word2id:  # skip words that are not in the original training vocabulary
                            continue
                        if len(word_replace) <= 3 or len(word) <= 3:  # skip words that are too short
                            continue
                        if word in word_replace or word_replace in word:  # skip words sharing a morpheme
                            continue
                        if word_replace in replace_word_list or word == word_replace:  # skip replacements already used
                            continue
                        word_list[choice_idx] = word_replace
                        print(word_replace + '\t' + word)
                        fw.write(' '.join(word_list) + '\n')
                        replace_num += 1
                        flags += 1
                        replace_word_list.append(word_replace)
                        break
                else:
                    break
print('all_num is %d, replace_num is %d, ratio is %f' % (all_num, replace_num, replace_num / all_num))
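Assuming the script is saved as wordnet_augment.py (the filename is not given in the original post), a typical run that writes three hyponym-based variants per sentence would be:

python wordnet_augment.py --negative_num 3 --every 1 --level down --seed 1234

Each input file such as rt-polarity.neg then gets a sibling output file named rt-polarity_3_1_down.neg, containing every original line followed by its augmented copies.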