Large language models have spawned a flood of projects. The recently popular GraphRAG leans on an LLM at every key step, but how well that actually works is debatable. Take its knowledge-graph construction stage: extraction still rests on prompt design, and on the texts I tested, the quality of the extracted graphs clearly has room to improve. People may have forgotten that before LLMs arrived, BERT ruled NLP. So this post shares an approach built specifically for knowledge-graph extraction: relation extraction with BERT. Enough preamble, on to the good stuff.
Tools
This walkthrough uses BERT for relation extraction via the bert4keras Python package (there is also bert4torch; if you have the background, search GitHub for both projects and study them). The pretrained model is chinese_L-12_H-768_A-12, Google's open-source Chinese BERT-base weights, which we fine-tune here. Plenty of other BERT weights exist, some better than this one, so feel free to experiment.
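Before diving in, a quick environment sketch. The install line and directory layout below are my own assumptions (the article does not pin versions); they simply mirror the config paths used later, and serve to verify that the downloaded checkpoint loads:

# Setup sketch (assumed, not from the article):
#   pip install bert4keras
# Download chinese_L-12_H-768_A-12 from https://github.com/google-research/bert
# and unpack it under data/, then check that the checkpoint loads:
from bert4keras.models import build_transformer_model

bert = build_transformer_model(
    config_path='data/chinese_L-12_H-768_A-12/bert_config.json',
    checkpoint_path='data/chinese_L-12_H-768_A-12/bert_model.ckpt',
)
bert.summary()  # BERT-base: 12 layers, hidden size 768, 12 attention heads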
Data
The data comes from a past competition (I forget exactly which one, ha; judging by the file names below it is Baidu's DuEE event-extraction data). It has two parts. The first is the schema describing the relations:
{"event_type": "财经/交易-出售/收购", "role_list": [{"role": "时间"}, {"role": "出售方"}, {"role": "交易物"}, {"role": "出售价格"}, {"role": "收购方"}], "id": "804336473abe8b8124d00876a5387151", "class": "财经/交易"}{"event_type": "财经/交易-跌停", "role_list": [{"role": "时间"}, {"role": "跌停股票"}], "id": "29a8f7417bf8867ddb8521f647a828d8", "class": "财经/交易"}
event_type is the relation (event) type; the entries in role_list are the entity types to extract.
The second part is the training data:
{"text": "雀巢裁员4000人:时代抛弃你时,连招呼都不会打!", "id": "409389c96efe78d6af1c86e0450fd2d7", "event_list": [{"event_type": "组织关系-裁员", "trigger": "裁员", "trigger_start_index": 2, "arguments": [{"argument_start_index": 0, "role": "裁员方", "argument": "雀巢", "alias": []}, {"argument_start_index": 4, "role": "裁员人数", "argument": "4000人", "alias": []}], "class": "组织关系"}]}{"text": "美国“未来为”子公司大幅度裁员,这是为什么呢?任正非正式回应", "id": "5aec2b5b759c5f8f42f9c0156eb3c924", "event_list": [{"event_type": "组织关系-裁员", "trigger": "裁员", "trigger_start_index": 13, "arguments": [{"argument_start_index": 0, "role": "裁员方", "argument": "美国“未来为”子公司", "alias": []}], "class": "组织关系"}]}
Each record contains the text to extract relations from, along with the annotated entities.
The data is split into train, test, and dev sets, which I will share with you.
The code
The code is fairly concise; the full script is at the end of the post, and below I walk through it piece by piece.
Training code
Load the required packages and the pretrained model, and set the training parameters:
#! -*- coding: utf-8 -*-
import json
import numpy as np
from bert4keras.backend import keras, K, search_layer
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from bert4keras.layers import ConditionalRandomField
from keras.layers import Dense
from keras.models import Model
from tqdm import tqdm
import pylcs
from keras.layers import Input, Embedding, LSTM, Bidirectional, GRU
from keras.optimizers import Optimizer
from tensorflow.keras.callbacks import LearningRateScheduler
import tensorflow as tf
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ['TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_IGNORE_PERFORMANCE'] = '1'
# Basic settings
train = True  # True = train, False = predict
maxlen = 220
epochs = 10
batch_size = 16
learning_rate = 1e-5
crf_lr_multiplier = 100  # enlarge the CRF layer's learning rate when necessary
model_save = 'best_model_v4.weights'  # the best score so far came from model v2
# BERT config
config_path = 'data/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'data/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = 'data/chinese_L-12_H-768_A-12/vocab.txt'
Load the training data:
def load_data(filename):
    """Parse one JSON record per line into (text, {argument: (event_type, role)})."""
    D = []
    with open(filename) as f:
        for l in f:
            l = json.loads(l)
            arguments = {}
            for event in l['event_list']:
                for argument in event['arguments']:
                    key = argument['argument']
                    value = (event['event_type'], argument['role'])
                    arguments[key] = value
            D.append((l['text'], arguments))
    return D
# Load the data
train_data = load_data('data/duee_train.json')
valid_data = load_data('data/duee_dev.json')

# Read the schema
with open('data/duee_event_schema.json') as f:
    id2label, label2id, n = {}, {}, 0
    for l in f:
        l = json.loads(l)
        for role in l['role_list']:
            key = (l['event_type'], role['role'])
            id2label[n] = key
            label2id[key] = n
            n += 1
    num_labels = len(id2label) * 2 + 1  # one B tag and one I tag per (event_type, role), plus O
# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
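To make the labeling scheme concrete: label 0 is O, and the k-th (event_type, role) pair gets B tag 2k + 1 and I tag 2k + 2, which is where len(id2label) * 2 + 1 comes from. A small sketch using the first training example shown earlier:

# Each element of train_data is (text, {argument: (event_type, role)}), e.g.
# ('雀巢裁员4000人:时代抛弃你时,连招呼都不会打!',
#  {'雀巢': ('组织关系-裁员', '裁员方'), '4000人': ('组织关系-裁员', '裁员人数')})
k = label2id[('组织关系-裁员', '裁员方')]
b_tag, i_tag = k * 2 + 1, k * 2 + 2  # tags for this role; every other token gets 0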
Build the data generator that feeds training batches to BERT:
def search(pattern, sequence):
    """Search for the sub-list `pattern` inside `sequence`.
    Returns the first start index if found, otherwise -1.
    """
    n = len(pattern)
    for i in range(len(sequence)):
        if sequence[i:i + n] == pattern:
            return i
    return -1
class data_generator(DataGenerator):
    """Data generator: converts (text, arguments) pairs into padded batches."""
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, (text, arguments) in self.sample(random):
            token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
            labels = [0] * len(token_ids)
            for argument in arguments.items():
                # encode the argument (dropping [CLS]/[SEP]) and locate it in the token sequence
                a_token_ids = tokenizer.encode(argument[0])[0][1:-1]
                start_index = search(a_token_ids, token_ids)
                if start_index != -1:
                    labels[start_index] = label2id[argument[1]] * 2 + 1  # B tag
                    for i in range(1, len(a_token_ids)):
                        labels[start_index + i] = label2id[argument[1]] * 2 + 2  # I tag
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append(labels)
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_labels = sequence_padding(batch_labels)
                yield [batch_token_ids, batch_segment_ids], batch_labels
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []
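With the generator defined, instantiating it is one line (a usage sketch consistent with the bert4keras DataGenerator API; the full script at the end presumably does the same):

train_generator = data_generator(train_data, batch_size)
# Each yielded batch is ([batch_token_ids, batch_segment_ids], batch_labels),
# padded to the longest sequence in the batch by sequence_padding.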
Build the model:
model = build_transformer_model(
    config_path,
    checkpoint_path,
    model='bert'
)
# Optional BiLSTM between BERT and the classifier (disabled here):
# lstm_output = Bidirectional(LSTM(num_labels // 2, dropout=0.2, return_sequences=True))(model.output)
output = Dense(num_labels)(model.output)
CRF = ConditionalRandomField(lr_multiplier=crf_lr_multiplier)
output = CRF(output)
model = Model(model.input, output)
model.summary()

opt = Adam(learning_rate)
# opt = AccumOptimizer(Adam(learning_rate), 5)  # gradient accumulation over 5 steps (AccumOptimizer is not defined in this snippet)
# Mixed-precision rewrite, if desired:
# opt = tf.train.experimental.enable_mixed_precision_graph_rewrite(
#     opt, loss_scale='dynamic')
model.compile(
    loss=CRF.sparse_loss,
    optimizer=opt,
    metrics=[CRF.sparse_accuracy]
)
Decode the trained model's output:
def viterbi_decode(nodes, trans):
    """Viterbi decoding of the best tag path.
    nodes.shape = [seq_len, num_labels], trans.shape = [num_labels, num_labels].
    """
    labels = np.arange(num_labels).reshape((1, -1))
    scores = nodes[0].reshape((-1, 1))
    scores[1:] -= np.inf  # the first token ([CLS]) must carry label 0
    paths = labels
    for l in range(1, len(nodes)):
        M = scores + trans + nodes[l].reshape((1, -1))
        idxs = M.argmax(0)
        scores = M.max(0).reshape((-1, 1))
        paths = np.concatenate([paths[:, idxs], labels], 0)
    return paths[:, scores[:, 0].argmax()]
def extract_arguments(text):
    """Extract arguments from raw text with the trained model."""
    tokens = tokenizer.tokenize(text)
    while len(tokens) > 510:
        tokens.pop(-2)  # truncate from the end while keeping [SEP]
    mapping = tokenizer.rematch(text, tokens)
    token_ids = tokenizer.tokens_to_ids(tokens)
    segment_ids = [0] * len(token_ids)
    nodes = model.predict([[token_ids], [segment_ids]])[0]
    trans = K.eval(CRF.trans)
    labels = viterbi_decode(nodes, trans)
    arguments, starting = [], False
    for i, label in enumerate(labels):
        if label > 0:
            if label % 2 == 1:  # B tag: start a new argument
                starting = True
                arguments.append([[i], id2label[(label - 1) // 2]])
            elif starting:  # I tag: extend the current argument
                arguments[-1][0].append(i)
            else:
                starting = False
        else:
            starting = False
    return {
        text[mapping[w[0]][0]:mapping[w[-1]][-1] + 1]: l
        for w, l in arguments
    }
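For a concrete picture of the output format, running the extractor on the first training example should give something like the following (illustrative; the actual output depends on the trained weights):

print(extract_arguments('雀巢裁员4000人:时代抛弃你时,连招呼都不会打!'))
# {'雀巢': ('组织关系-裁员', '裁员方'), '4000人': ('组织关系-裁员', '裁员人数')}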
The evaluation function:
def evaluate(data):
    """Evaluation (not guaranteed identical to the official metric, but close)."""
    X, Y, Z = 1e-10, 1e-10, 1e-10
    for text, arguments in tqdm(data):
        inv_arguments = {v: k for k, v in arguments.items()}
        pred_arguments = extract_arguments(text)
        pred_inv_arguments = {v: k for k, v in pred_arguments.items()}
        Y += len(pred_inv_arguments)
        Z += len(inv_arguments)
        for k, v in pred_inv_arguments.items():
            if k in inv_arguments:
                # soft match: score by longest common subsequence length
                l = pylcs.lcs(v, inv_arguments[k])
                X += 2. * l / (len(v) + len(inv_arguments[k]))
    f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
    return f1, precision, recall
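Note that the matching is soft: a prediction that only overlaps the gold argument still earns partial credit. For example:

import pylcs

pred, gold = '美国“未来为”子公司', '未来为子公司'
l = pylcs.lcs(pred, gold)                  # longest common subsequence length: 6
score = 2.0 * l / (len(pred) + len(gold))  # partial credit: 12 / 16 = 0.75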
Write predictions to a file:
def predict_to_file(in_file, out_file):
    """Write predictions to a file in the submission format."""
    fw = open(out_file, 'w', encoding='utf-8')
    # NOTE: hard-coded, environment-specific path; point it at your own schema file
    with open("/home/maxin/baidu_data/baidu_ee/event_schema.json", 'r', encoding='UTF-8') as x:
        event_dict = json.load(x)
    with open(in_file) as fr:
        for l in tqdm(fr):
            l = json.loads(l)
            arguments = extract_arguments(l['text'])
            event_list = []
            for k, v in arguments.items():
                event_list.append({
                    'event_type': v[0],
                    'trigger': v[0].split('-')[-1],
                    'trigger_start_index': 0,
                    'arguments': [{
                        'argument_start_index': 0,
                        'role': v[1],
                        'argument': k,
                        'class': event_dict.get(v[0])
                    }]
                })
            l['event_list'] = event_list
            l = json.dumps(l, ensure_ascii=False)
            fw.write(l + '\n')
    fw.close()
class Evaluator(keras.callbacks.Callback):
    """Evaluate after each epoch and keep the best weights."""
    def __init__(self):
        self.best_val_f1 = 0.

    def on_epoch_end(self, epoch, logs=None):
        f1, precision, recall = evaluate(valid_data)
        if f1 >= self.best_val_f1:
            self.best_val_f1 = f1
            model.save_weights(model_save)
        print(
            'f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' %
            (f1, precision, recall, self.best_val_f1)
        )
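The snippets above stop short of the actual fit call. Under the train flag defined at the top, the usual bert4keras entry point would look roughly like this (my sketch, not verbatim from the article; the duee_test.json and pred.json file names are assumed):

if __name__ == '__main__':
    if train:
        train_generator = data_generator(train_data, batch_size)
        model.fit(
            train_generator.forfit(),  # endless batch generator for Keras
            steps_per_epoch=len(train_generator),
            epochs=epochs,
            callbacks=[Evaluator()]
        )
    else:
        model.load_weights(model_save)
        predict_to_file('data/duee_test.json', 'pred.json')  # assumed file names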