WordNet 的一些用法

2019-05-07  本文已影响0人  VanJordan
import argparse
import os
import random
import re

from nltk.corpus import wordnet as wn

# ---------------------------------------------------------------------------
# Command-line options.
#   --negative_num : the main loop below iterates this many times per sentence
#   --level        : WordNet relation used for candidates; 'random' is
#                    resolved by the main loop, not by get_candidate()
#   --every        : compared against a per-sentence counter in the main loop
#   --seed         : seed for the ``random`` module
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser()
parser.add_argument(
    "--negative_num",
    type=int,
    default=3,
    help="the ratio of doing data augment",
)
parser.add_argument(
    '--level',
    type=str,
    default='down',
    help='down up or same mean hyponyms or random',
)
parser.add_argument(
    '--every',
    type=int,
    default=1,
    help='for every sentence choice how much words to be replaced',
)
parser.add_argument(
    '--seed',
    type=int,
    default=1234,
    help='random number seed',
)
args = parser.parse_args()

# Seed once, up front, so every later shuffle/choice is reproducible.
random.seed(args.seed)

# Input locations.  An earlier revision used relative paths
# ('./data/vocab.txt', './data/rt-polarity.neg', 'rt-polarity.pos').
data_dir = 'D:\\exchange\\adversarial_text\\src\\data\\'
vocab_path = data_dir + 'vocab.txt'
filename_list = [
    data_dir + 'rt-polarity.neg',
    data_dir + 'rt-polarity.pos',
]

# Running totals reported after all files have been processed.
all_num = 0      # input sentences read
replace_num = 0  # augmented sentences written

# Vocabulary lookup: token -> line index in the vocab file.
word2id = {}

def print_log(file):
    """Create a logging callable bound to the log file at ``file``.

    Args:
        file: Path of the log file.  It is reopened in append mode on every
            call of the returned function.

    Returns:
        A one-argument function that prints its argument to stdout and
        appends the argument's string form, followed by a newline, to the
        log file.
    """
    def write_log(s):
        # Echo to the console first, then mirror the same line into the file.
        print(s)
        with open(file, 'a') as log_handle:
            print(s, file=log_handle)

    return write_log


# Replace the factory with the concrete logger it produces; from here on
# ``print_log(msg)`` prints and appends to the log file.
print_log = print_log("D:\\exchange\\adversarial_text\\src\\data\\log.log")

# Build the vocabulary lookup: each stripped line of the vocab file maps to
# its zero-based line index (a repeated entry keeps the last index seen).
with open(vocab_path, 'r', encoding='iso8859-1') as vocab_file:
    for idx, word in enumerate(vocab_file):
        word2id[word.strip()] = idx

print_log('word2id:%d' % len(word2id))
def get_candidate(word=None, level='down'):
    """Return a shuffled list of WordNet lemma names related to ``word``.

    Args:
        word: The token whose synsets are looked up via ``wn.synsets``.
        level: Which relation to follow from each synset of ``word``:
            ``'down'`` uses ``hyponyms()``, ``'up'`` uses ``hypernyms()``,
            and ``'same'`` uses the synsets themselves.

    Returns:
        A list of lemma names (duplicates are not removed), shuffled in
        place with ``random.shuffle``.

    Raises:
        ValueError: If ``level`` is not ``'down'``, ``'up'`` or ``'same'``.
            The message also mentions ``random``; that value is not accepted
            here and has to be resolved by the caller.
    """
    if level not in ('down', 'up', 'same'):
        raise ValueError('please check the input augment \"level\" ,it must be one of down ,up ,same,or random'
                         )

    synsets = wn.synsets(word)

    # Pick the synsets whose lemma names become candidates, preserving the
    # synset-by-synset order so the shuffle starts from the same sequence.
    if level == 'same':
        sources = synsets
    elif level == 'down':
        sources = [related for synset in synsets for related in synset.hyponyms()]
    else:
        sources = [related for synset in synsets for related in synset.hypernyms()]

    word_candidate = [lemma for source in sources for lemma in source.lemma_names()]
    random.shuffle(word_candidate)
    return word_candidate


# Augment every corpus file.  Each input sentence is copied to the output
# file, followed by up to ``args.negative_num`` variants in which one randomly
# chosen word is replaced by a WordNet candidate that is also in the vocabulary.
#
# NOTE(review): the original code reset its replacement counter to 0 before
# each ``counter < args.every`` check, so ``--every`` never allowed more than
# one replacement per variant; a value <= 0 simply disabled augmentation.
# That behaviour is preserved here -- confirm whether replacing several words
# per variant was the intent.
attempts_per_line = args.negative_num if args.every > 0 else 0

for filename in filename_list:
    # splitext only splits off the final extension, so dots elsewhere in the
    # path (e.g. "./data/rt-polarity.neg") no longer corrupt the output name.
    stem, extension = os.path.splitext(filename)
    out_filename = (stem + '_' + str(args.negative_num) + '_' + str(args.every)
                    + '_' + args.level + extension)

    with open(filename, 'r', encoding='iso8859-1') as fr, \
            open(out_filename, 'w', encoding='iso8859-1') as fw:
        for line in fr:
            all_num += 1
            line = line.strip()
            fw.write(line + '\n')

            if not line:
                # Blank line: there is no word to replace.  Without this guard
                # the code below would reuse ``word``/``choice_idx`` left over
                # from an earlier sentence (or fail because they are unset).
                continue

            replace_word_list = []  # replacements already used for this sentence

            for _ in range(attempts_per_line):
                word_list = line.split()  # fresh copy: one slot is overwritten below

                # Draw random positions (at most len(word_list) tries) until a
                # word known to WordNet is found; otherwise keep the last draw.
                for _ in range(len(word_list)):
                    choice_idx = random.choice(range(len(word_list)))
                    word = word_list[choice_idx]
                    if wn.synsets(word):
                        break

                if args.level == 'random':
                    word_candidate = get_candidate(word, random.choice(['same', 'down', 'up']))
                else:
                    word_candidate = get_candidate(word, args.level)

                # Use the first candidate that passes every filter.
                for word_replace in word_candidate:
                    if word_replace not in word2id:
                        # not part of the training vocabulary
                        continue
                    if len(word_replace) <= 3 or len(word) <= 3:
                        # either word is too short
                        continue
                    if word in word_replace or word_replace in word:
                        # one contains the other (likely the same morpheme)
                        continue
                    if word_replace in replace_word_list or word == word_replace:
                        # already used for this sentence, or no change at all
                        continue

                    word_list[choice_idx] = word_replace
                    print(word_replace + '\t' + word)
                    fw.write(' '.join(word_list) + '\n')
                    replace_num += 1
                    replace_word_list.append(word_replace)
                    break

# Guard the ratio so that empty input files do not raise ZeroDivisionError.
replace_ratio = replace_num / all_num if all_num else 0.0
print('all_num is %d, replace_num is %d, ratio is %f' % (all_num, replace_num, replace_ratio))


上一篇 下一篇

猜你喜欢

热点阅读