Shandong Algorithm Competition: Topline for Intelligent Grid Event Classification
2021-12-30
致Great
- Competition link: http://data.sd.gov.cn/cmpt/cmptDetail.html?id=67
- Baseline: https://aistudio.baidu.com/aistudio/projectdetail/3371314?contributionType=1
- Score: 0.749+
Task
(1) Competition task
Based on the grid event data, extract and analyze the content of events in the grid and classify the events into categories; concretely, given the provided event description, determine the type of government affairs the event belongs to.
(2) Data usage rules
No external data of any kind may be used for this competition.
(3) A/B leaderboard
An A/B leaderboard is used: the A board runs from the opening of submissions until January 18, 2022, and the B board runs from January 19, 2022 to January 21, 2022.
Data
Note: data download access is granted after registering for the competition or joining a team.
The data is provided for download; contestants debug their algorithms locally and submit results on the competition page. At most 28,000 records will be provided, covering both the training and test sets; the data actually released is authoritative. The training set has roughly the following form:
The test set samples do not contain the label field. To keep the competition fair, only the officially released data and annotations may be used; otherwise the result will be treated as invalid.
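The sample rows are not reproduced here; judging from the baseline code below, the training CSV appears to contain an id, an event title (name), the event description (content) and an integer label over 25 classes, while testa_nolabel.csv carries the same fields without label. This is an inference from the code rather than the official schema; a quick local check could look like:
import pandas as pd
# Column names below are inferred from the baseline code, not from the official docs.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/testa_nolabel.csv')
print(train.columns.tolist())    # expected to include: id, name, content, label
print(test.columns.tolist())     # expected to include: id, name, content
print(train['label'].nunique())  # expected: 25 event categories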
Code
import os
import random
from functools import partial
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import paddle
import paddle as P
import paddle.nn.functional as F
import paddlenlp as ppnlp  # PaddleNLP: Paddle's NLP library, analogous to Hugging Face transformers
import pandas as pd
from paddle.io import Dataset
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.datasets import MapDataset
from paddlenlp.transformers import LinearDecayWithWarmup
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
import paddle.fluid as fluid
import paddle.nn as nn
# =============================== Initialization ===============================
class Config:
    text_col = 'text'
    target_col = 'label'
    # maximum sequence length
    max_len = 256  # measured on len(text) or the tokenizer output, 256 covers ~95% of samples (max ~502)
    # batch size for training and evaluation
    batch_size = 32
    target_size = 25
    seed = 71
    n_fold = 5
    # peak learning rate during training
    learning_rate = 5e-5
    # number of training epochs
    epochs = 10  # 3
    # warmup proportion for the learning-rate schedule
    warmup_proportion = 0.1
    # weight decay coefficient, a regularization strategy to reduce overfitting
    weight_decay = 0.01
    model_name = "ernie-gram-zh"
    print_freq = 100
def seed_torch(seed=42):
    # fix the Python and NumPy random seeds for reproducibility
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
def concat_text(row):
    # concatenate the event title and its description into one text field
    return str(row['name']) + ',' + row['content']
CFG = Config()
seed_torch(seed=CFG.seed)
# y = train[CFG.target_col]
# class_weight = 'balanced'
# classes = train[CFG.target_col].unique()  # label classes
# weight = compute_class_weight(class_weight=class_weight,classes= classes, y=y)
# print(weight)
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/testa_nolabel.csv')
train.fillna('', inplace=True)
test.fillna('', inplace=True)
train['text'] = train.apply(lambda row: concat_text(row), axis=1)
test['text'] = test.apply(lambda row: concat_text(row), axis=1)
# CV split: 5-fold StratifiedKFold (stratified sampling)
folds = train.copy()
Fold = StratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
for n, (train_index, val_index) in enumerate(Fold.split(folds, folds[CFG.target_col])):
    folds.loc[val_index, 'fold'] = int(n)
folds['fold'] = folds['fold'].astype(int)
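# Optional sanity check (not part of the original baseline): StratifiedKFold should
# keep the label distribution roughly identical across folds; a quick look at the
# normalized per-fold label counts confirms the split worked as intended.
print(folds.groupby('fold')[CFG.target_col].value_counts(normalize=True))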
# ====================================== Dataset and conversion functions ==============================
# map-style dataset built on paddle.io.Dataset (PyTorch-style interface)
class CustomDataset(Dataset):
    def __init__(self, df):
        self.data = df.values.tolist()
        self.texts = df[CFG.text_col]
        self.labels = df[CFG.target_col]
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        """
        Fetch the sample at the given index.
        :param idx: sample index
        :return: dict containing the raw text and its label
        """
        text = str(self.texts[idx])
        label = self.labels[idx]
        example = {'text': text, 'label': label}
        return example
def convert_example(example, tokenizer, max_seq_length=512, is_test=False):
    """
    Build BERT-style model inputs, e.g. token_type_ids of the form::
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        |     first sequence      | second sequence |
    Returns:
        input_ids(obj:`list[int]`): The list of token ids.
        token_type_ids(obj:`list[int]`): List of sequence pair mask.
        label(obj:`numpy.array`, data type of int64, optional): The input label if not is_test.
    """
    encoded_inputs = tokenizer(text=example["text"], max_seq_len=max_seq_length)
    input_ids = encoded_inputs["input_ids"]
    token_type_ids = encoded_inputs["token_type_ids"]
    if not is_test:
        label = np.array([example["label"]], dtype="int64")
        return input_ids, token_type_ids, label
    else:
        return input_ids, token_type_ids
def create_dataloader(dataset,
                      mode='train',
                      batch_size=1,
                      batchify_fn=None,
                      trans_fn=None):
    if trans_fn:
        dataset = dataset.map(trans_fn)
    shuffle = True if mode == 'train' else False
    if mode == 'train':
        batch_sampler = paddle.io.DistributedBatchSampler(
            dataset, batch_size=batch_size, shuffle=shuffle)
    else:
        batch_sampler = paddle.io.BatchSampler(
            dataset, batch_size=batch_size, shuffle=shuffle)
    return paddle.io.DataLoader(
        dataset=dataset,
        batch_sampler=batch_sampler,
        collate_fn=batchify_fn,
        return_list=True)
# tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained(CFG.model_name)
tokenizer = ppnlp.transformers.ErnieGramTokenizer.from_pretrained(CFG.model_name)
trans_func = partial(
    convert_example,
    tokenizer=tokenizer,
    max_seq_length=CFG.max_len)
batchify_fn = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=tokenizer.pad_token_id),       # input_ids
    Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # token_type_ids (segment)
    Stack(dtype="int64")                               # label
): [data for data in fn(samples)]
# ====================================== Training, evaluation and prediction functions ==============================
@paddle.no_grad()
def evaluate(model, criterion, metric, data_loader):
    """
    Evaluate the model on the validation set and return its accuracy.
    """
    model.eval()
    metric.reset()
    losses = []
    for batch in data_loader:
        input_ids, token_type_ids, labels = batch
        logits = model(input_ids, token_type_ids)
        loss = criterion(logits, labels)
        losses.append(loss.numpy())
        correct = metric.compute(logits, labels)
        metric.update(correct)
    accu = metric.accumulate()
    print("eval loss: %.5f, accu: %.5f" % (np.mean(losses), accu))
    model.train()
    metric.reset()
    return accu
def predict(model, data, tokenizer, batch_size=1):
    """
    Run inference and return the class-probability matrix for all samples in `data`.
    """
    examples = []
    for text in data:
        input_ids, segment_ids = convert_example(
            text,
            tokenizer,
            max_seq_length=CFG.max_len,
            is_test=True)
        examples.append((input_ids, segment_ids))
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),       # input ids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment ids
    ): fn(samples)
    # Separate the data into batches.
    batches = []
    one_batch = []
    for example in examples:
        one_batch.append(example)
        if len(one_batch) == batch_size:
            batches.append(one_batch)
            one_batch = []
    if one_batch:
        # The last batch, whose size is less than the configured batch_size.
        batches.append(one_batch)
    results = []
    model.eval()
    for batch in tqdm(batches):
        input_ids, segment_ids = batchify_fn(batch)
        input_ids = paddle.to_tensor(input_ids)
        segment_ids = paddle.to_tensor(segment_ids)
        logits = model(input_ids, segment_ids)
        probs = F.softmax(logits, axis=1)
        results.append(probs.numpy())
    return np.vstack(results)
def inference():
    model_paths = [
        'ernie-gram-zh_fold0.bin',
        'ernie-gram-zh_fold1.bin',
        'ernie-gram-zh_fold2.bin',
        'ernie-gram-zh_fold3.bin',
        'ernie-gram-zh_fold4.bin',
    ]
    # model = ppnlp.transformers.ErnieForSequenceClassification.from_pretrained(CFG.model_name,
    #                                                                           num_classes=25)
    model = ppnlp.transformers.ErnieGramForSequenceClassification.from_pretrained(CFG.model_name,
                                                                                  num_classes=25)
    fold_preds = []
    for model_path in model_paths:
        model.load_dict(P.load(model_path))
        pred = predict(model, test.to_dict(orient='records'), tokenizer, 16)
        fold_preds.append(pred)
    preds = np.mean(fold_preds, axis=0)  # average the probabilities of the five folds
    np.save("preds.npy", preds)
    labels = np.argmax(preds, axis=1)
    test['label'] = labels
    test[['id', 'label']].to_csv('paddle.csv', index=None)
def train():
    # ==================================== Cross-validation training ==========================
    for fold in range(5):
        print(f"===============training fold_nth:{fold + 1}======================")
        trn_idx = folds[folds['fold'] != fold].index
        val_idx = folds[folds['fold'] == fold].index
        train_folds = folds.loc[trn_idx].reset_index(drop=True)
        valid_folds = folds.loc[val_idx].reset_index(drop=True)
        train_dataset = CustomDataset(train_folds)
        train_ds = MapDataset(train_dataset)
        dev_dataset = CustomDataset(valid_folds)
        dev_ds = MapDataset(dev_dataset)
        train_data_loader = create_dataloader(
            train_ds,
            mode='train',
            batch_size=CFG.batch_size,
            batchify_fn=batchify_fn,
            trans_fn=trans_func)
        dev_data_loader = create_dataloader(
            dev_ds,
            mode='dev',
            batch_size=CFG.batch_size,
            batchify_fn=batchify_fn,
            trans_fn=trans_func)
        model = ppnlp.transformers.ErnieGramForSequenceClassification.from_pretrained(CFG.model_name,
                                                                                      num_classes=25)
        num_training_steps = len(train_data_loader) * CFG.epochs
        lr_scheduler = LinearDecayWithWarmup(CFG.learning_rate, num_training_steps, CFG.warmup_proportion)
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler,
            parameters=model.parameters(),
            weight_decay=CFG.weight_decay,
            apply_decay_param_fun=lambda x: x in [
                p.name for n, p in model.named_parameters()
                if not any(nd in n for nd in ["bias", "norm"])
            ])
        criterion = paddle.nn.loss.CrossEntropyLoss()
        metric = paddle.metric.Accuracy()
        global_step = 0
        best_val_acc = 0
        for epoch in range(1, CFG.epochs + 1):
            for step, batch in enumerate(train_data_loader, start=1):
                input_ids, segment_ids, labels = batch
                logits = model(input_ids, segment_ids)
                # probs_ = paddle.to_tensor(logits, dtype="float64")
                loss = criterion(logits, labels)
                probs = F.softmax(logits, axis=1)
                correct = metric.compute(probs, labels)
                metric.update(correct)
                acc = metric.accumulate()
                global_step += 1
                if global_step % CFG.print_freq == 0:
                    print("global step %d, epoch: %d, batch: %d, loss: %.5f, acc: %.5f" % (
                        global_step, epoch, step, loss, acc))
                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.clear_grad()
            # evaluate on the held-out fold after every epoch and keep the best checkpoint
            acc = evaluate(model, criterion, metric, dev_data_loader)
            if acc > best_val_acc:
                best_val_acc = acc
                P.save(model.state_dict(), f'{CFG.model_name}_fold{fold}.bin')
        print('Best Val acc %.5f' % best_val_acc)
        del model
if __name__ == '__main__':
    train()
    inference()
# Ideas for further improvement:
# - Focal loss
# - class weights (class_weights)
# - ernie > chinese_roberta_wwm (ERNIE outperformed RoBERTa-wwm on this task)
# - NEZHA
# - Long texts: split a long sentence into sub-sentences, turning one sample into two sub-clause samples (text length matters for this task)
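As a rough illustration of the first two ideas (a sketch added here, not part of the original baseline), a multi-class focal loss with optional per-class weights can be written as a small paddle.nn.Layer. The class name, gamma value and weighting scheme below are assumptions for illustration; it would replace CrossEntropyLoss via criterion = FocalLoss(...) inside train().
class FocalLoss(paddle.nn.Layer):
    """Sketch of a multi-class focal loss: scales cross-entropy by (1 - p_t) ** gamma
    so that easy examples contribute less; `alpha` is an optional per-class weight
    tensor of shape [num_classes] (e.g. built from sklearn's compute_class_weight)."""
    def __init__(self, gamma=2.0, alpha=None):
        super().__init__()
        self.gamma = gamma
        self.alpha = alpha  # paddle.Tensor of shape [num_classes], or None
    def forward(self, logits, labels):
        # labels arrive as int64 tensors of shape [batch_size, 1] (see convert_example)
        labels = labels.reshape([-1])
        log_probs = F.log_softmax(logits, axis=-1)
        probs = paddle.exp(log_probs)
        one_hot = F.one_hot(labels, num_classes=logits.shape[-1])
        pt = paddle.sum(one_hot * probs, axis=-1)         # probability of the gold class
        log_pt = paddle.sum(one_hot * log_probs, axis=-1)
        loss = -((1.0 - pt) ** self.gamma) * log_pt
        if self.alpha is not None:
            loss = loss * paddle.gather(self.alpha, labels)  # per-class re-weighting
        return loss.mean()
# Hypothetical usage inside train(), replacing the plain cross-entropy:
# criterion = FocalLoss(gamma=2.0)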