深度学习 姓名与性别

2020-04-15  本文已影响0人  small瓜瓜

仅凭姓名能否判断出性别呢?例如“小红”这个名字,多半是一位女性。为了验证姓名与性别之间是否存在关联,我借助网上提供的数据集,编写了以下代码进行测试。
代码如下:

import os

import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential, layers, losses, optimizers

# Let TensorFlow allocate GPU memory on demand instead of reserving all of it.
# The setting is applied to every visible GPU (TensorFlow requires it to be
# consistent across devices), and the script still runs when no GPU is
# present because the loop is simply skipped.
gpu = tf.config.experimental.list_physical_devices(device_type='GPU')
for device in gpu:
    tf.config.experimental.set_memory_growth(device, True)

# Every name is truncated or zero-padded to exactly this many characters.
NAME_MAX_LEN = 4


def get_word_index(ch: str):
    """Map a single character to its embedding index.

    The index space is laid out as follows:

    * ``0``            -- any character outside the two ranges below
      (index 0 is also the value used for padding).
    * ``1..6582``      -- CJK Extension A, U+3400..U+4DB5 (13312..19893).
    * ``6583..27484``  -- CJK Unified Ideographs, U+4E00..U+9FA5
      (19968..40869).

    Args:
        ch: A string of exactly one character.

    Returns:
        The integer index for ``ch``.

    Raises:
        TypeError: If ``ch`` is not a one-character string.
    """
    # ord() yields the code point directly; parsing the "unicode-escape"
    # bytes fails for characters that are not escaped as \uXXXX (e.g. ASCII).
    code_point = ord(ch)
    if 19968 <= code_point <= 40869:
        return code_point - 19967 + 6582
    if 13312 <= code_point <= 19893:
        # +1 keeps U+3400 from colliding with the padding/unknown index 0.
        return code_point - 13312 + 1
    return 0


def get_word_text(idx: int):
    """Return the character whose Unicode code point is ``idx``.

    NOTE(review): ``get_word_index`` returns offset indices rather than raw
    code points, so this function is not its inverse -- confirm whether an
    index-to-character mapping was intended.
    """
    character = chr(idx)
    return character


def encode_name(name):
    """Encode a name as a list of exactly ``NAME_MAX_LEN`` integer indices.

    Each character is converted with ``get_word_index``. Names longer than
    ``NAME_MAX_LEN`` are truncated; shorter ones are right-padded with 0.
    """
    codes = [get_word_index(ch) for ch in name][:NAME_MAX_LEN]
    missing = NAME_MAX_LEN - len(codes)
    return codes + [0] * missing


def get_name_sex(test_ratio=0.2, path=r'..\nameSex.txt'):
    """Load the name/sex data set and split it into train and test parts.

    Each non-blank line of the file is expected to look like ``name:label``
    where ``label`` is an integer. Names are encoded with ``encode_name``.
    The split is taken in file order (no shuffling): the leading
    ``1 - test_ratio`` share of the samples becomes the training set and the
    remainder the test set.

    Args:
        test_ratio: Fraction of the samples reserved for testing.
        path: Location of the UTF-8 encoded data file.

    Returns:
        ``((x_train, y_train), (x_test, y_test))`` where the ``x_*`` values
        are lists of encoded names and the ``y_*`` values are lists of ints.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a non-blank line has no valid integer label.
    """
    X = []
    Y = []

    with open(path, 'r', encoding='utf-8') as rf:
        for line in rf:
            # A trailing or stray empty line must not abort the whole load.
            if not line.strip():
                continue
            name, _, sex = line.partition(':')
            X.append(encode_name(name))
            Y.append(int(sex.strip()))

    train_num = int(len(X) * (1 - test_ratio))
    x_train, y_train = X[:train_num], Y[:train_num]
    x_test, y_test = X[train_num:], Y[train_num:]

    return (x_train, y_train), (x_test, y_test)


# Load the encoded names and their labels, already split into train/test.
(train_data, train_labels), (test_data, test_labels) = get_name_sex()

# Hold out the first 1000 training samples for validation during fitting.
x_val, partial_x_train = train_data[:1000], train_data[1000:]
y_val, partial_y_train = train_labels[:1000], train_labels[1000:]

# Number of rows in the embedding table; it has to be larger than the
# biggest index that get_word_index can return.
vocab_size = 27500

# Embedding -> two stacked GRUs -> single sigmoid unit (binary output).
model = Sequential()
model.add(layers.Embedding(vocab_size, 10))
model.add(layers.GRU(20, return_sequences=True))
model.add(layers.GRU(20, dropout=0.5))
model.add(layers.Dense(1, activation=tf.nn.sigmoid))

model.summary()

optimizer = optimizers.Adam(1e-2)

model.compile(
    loss=losses.binary_crossentropy,
    optimizer=optimizer,
    metrics=['accuracy'],
)


class ShowSaveCallback(tf.keras.callbacks.Callback):
    """Callback that prints the epoch logs and saves the model every epoch.

    The model being trained is taken from ``self.model``, which Keras sets
    on the callback during ``fit``, so the callback does not depend on a
    module-level ``model`` variable.
    """

    def on_epoch_end(self, epoch, logs=None):
        """Print ``logs`` and write the model to ``./model_back``.

        Args:
            epoch: Zero-based index of the epoch that just finished; used as
                the prefix of the saved file name.
            logs: Metrics dictionary supplied by Keras for this epoch.
        """
        print("log", logs)
        # Saving fails if the target directory is missing, so create it.
        os.makedirs("./model_back", exist_ok=True)
        self.model.save("./model_back/{}_name_sex_model.h5".format(epoch))


# Train for 10 epochs, validating on the held-out slice after each epoch;
# the callback prints the logs and saves a snapshot of the model each time.
history = model.fit(
    x=partial_x_train,
    y=partial_y_train,
    validation_data=(x_val, y_val),
    epochs=10,
    callbacks=[ShowSaveCallback()],
)

# Measure loss and accuracy on the test split.
model.evaluate(x=test_data, y=test_labels)

nameSex.txt 的数据格式(每行一条记录,形如“姓名:性别标签”;从样例来看,0 应为女性,1 应为男性):

欧:0
倩:0
妍:0
文竹:0
璐:0
彤彤:0
祎祎:0
紫微:0
裕贤:0
钰雯:0
睿:0
淑云:0
婧祺:0
玉:0
遥:0
丽香:0
钰:0
泸:1
烨婷:0
宸:1
佳琪:0
靖博:1
博安:1
阳枝山:1
靖茜:0

测试发现二者确实有关:模型的准确率约为 87%,说明大部分人的性别可以通过姓名判断出来,但仍有少部分无法判断。

上一篇 下一篇

猜你喜欢

热点阅读