Named Entity Recognition with a Bidirectional LSTM
import re
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Embedding, Activation
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import numpy as np
file_path = tf.keras.utils.get_file("train.txt",
"https://raw.githubusercontent.com/Franck-Dernoncourt/NeuroNER/master/neuroner/data/conll2003/en/train.txt")
for line in open(file_path, 'r').readlines()[:20]:
    print(repr(line))
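Each non-blank line of the CoNLL-2003 file carries four space-separated columns: the word, its POS tag, its chunk tag, and its NER tag. Blank lines separate sentences, and lines starting with -DOCSTART mark document boundaries. The first data lines look like this:

EU NNP B-NP B-ORG
rejects VBZ B-VP O
German JJ B-NP B-MISC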
tagged_sentences = []
sentence = []
for line in open(file_path, 'r'):
    if len(line) == 0 or line.startswith('-DOCSTART') or line[0] == "\n":
        if len(sentence) > 0:
            tagged_sentences.append(sentence)
            sentence = []
        continue
    word, pos_tag, chunk_tag, ner = line.split(' ')  # the four columns are space-separated
    ner = re.sub('\n', '', ner)   # strip the trailing newline: 'B-MISC\n' -> 'B-MISC'
    word = word.lower()           # store every word lowercased
    sentence.append((word, ner))  # keep only the word and its NER tag
print("전체 샘플 개수: ", len(tagged_sentences)) # 전체 샘플의 개수 출력
tagged_sentences[:3]
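Each element of tagged_sentences is a list of (word, ner) pairs. For the first sentence of the standard train split, the parse should come out roughly as follows (illustrative):

# tagged_sentences[0]:
[('eu', 'B-ORG'), ('rejects', 'O'), ('german', 'B-MISC'), ('call', 'O'),
 ('to', 'O'), ('boycott', 'O'), ('british', 'B-MISC'), ('lamb', 'O'), ('.', 'O')]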
inputs, labels = [], []
for pairs in tagged_sentences:
    words, tags = zip(*pairs)
    inputs.append(list(words))
    labels.append(list(tags))
# Let's see how a sequence looks
print(inputs[0])
print(labels[0])
print('Maximum sample length : {}'.format(max([len(w) for w in inputs])))
print('Average sample length : {:.4f}'.format(np.mean([len(w) for w in inputs])))
plt.hist([len(s) for s in inputs], bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()
MAX_LENGTH = 60
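The histogram justifies this cutoff: most samples are well under 60 tokens. A quick sanity check of how many samples fit within MAX_LENGTH (a sketch, reusing inputs from above):

coverage = np.mean([len(s) <= MAX_LENGTH for s in inputs])
print('Proportion of samples with length <= {}: {:.2%}'.format(MAX_LENGTH, coverage))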
tokenizer = Tokenizer()
tokenizer.fit_on_texts(inputs)
print(len(tokenizer.word_index))
MAX_WORDS = 4000
train_sentences, test_sentences, train_tags, test_tags \
= train_test_split(inputs, labels, test_size=0.2)
len(train_sentences), len(test_sentences), len(train_tags), len(test_tags)
entity_tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token='<OOV>')
entity_tokenizer.fit_on_texts(train_sentences)
tag_tokenizer = Tokenizer()
tag_tokenizer.fit_on_texts(labels)
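num_words=4000 caps the effective vocabulary: when oov_token is set, Keras reserves index 1 for <OOV>, and texts_to_sequences maps both unseen words and words outside the 4,000 most frequent ones to that index. A quick check (a sketch; 'zzzunseenzzz' is a made-up token):

print(entity_tokenizer.word_index['<OOV>'])                     # expected: 1
print(entity_tokenizer.texts_to_sequences([['zzzunseenzzz']]))  # expected: [[1]]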
vocab_size = entity_tokenizer.num_words + 1  # MAX_WORDS + 1; index 0 is reserved for padding
tag_size = len(tag_tokenizer.word_index) + 1 # number of distinct tags + 1 for the padding index
print('Vocabulary size : {}'.format(vocab_size))
print('Number of NER tags : {}'.format(tag_size))
print(tag_tokenizer.word_index)
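Because labels is a list of token lists, the Tokenizer keeps each tag intact (no splitting on '-'), but its default lower=True stores tags in lowercase, which is why the decoding step at the end calls tag.upper(). For CoNLL-2003 the mapping should contain nine entries, with 'o' by far the most frequent and therefore at index 1 (a sketch):

print(sorted(tag_tokenizer.word_index.keys()))
# expected (lowercased): ['b-loc', 'b-misc', 'b-org', 'b-per', 'i-loc', 'i-misc', 'i-org', 'i-per', 'o']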
X_train = entity_tokenizer.texts_to_sequences(train_sentences)
y_train = tag_tokenizer.texts_to_sequences(train_tags)
X_test = entity_tokenizer.texts_to_sequences(test_sentences)
y_test = tag_tokenizer.texts_to_sequences(test_tags)
len(X_train), len(y_train), len(X_test), len(y_test)
X_train[0], y_train[0]
X_train_padded = pad_sequences(X_train, maxlen=MAX_LENGTH, padding='post')
X_test_padded = pad_sequences(X_test, maxlen=MAX_LENGTH, padding='post')
y_train_padded = pad_sequences(y_train, maxlen=MAX_LENGTH, padding='post')
y_test_padded = pad_sequences(y_test, maxlen=MAX_LENGTH, padding='post')
print(X_train_padded[0])
print(X_test_padded[0])
print(y_train_padded[0])
print(y_test_padded[0])
y_train_onehot = to_categorical(y_train_padded, tag_size)
y_test_onehot = to_categorical(y_test_padded, tag_size)
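to_categorical turns each integer tag into a one-hot vector, so the label tensors gain a final axis of size tag_size. A quick shape check (a sketch):

print(y_train_onehot.shape)  # expected: (len(X_train_padded), MAX_LENGTH, tag_size)
print(y_test_onehot.shape)   # expected: (len(X_test_padded), MAX_LENGTH, tag_size)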
model = Sequential()
model.add(Embedding(vocab_size, 128))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(Dense(tag_size, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer=Adam(0.001), metrics=['accuracy'])
model.summary()
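return_sequences=True is what makes this a sequence-labeling model: the Bi-LSTM emits one 512-dimensional vector per timestep (256 per direction), and the softmax Dense layer is applied at every position. An equivalent, more explicit variant wraps the classifier in TimeDistributed and masks the padding index (a sketch, not the model trained above; mask_zero=True is an optional improvement that lets Keras ignore padded positions):

from tensorflow.keras.layers import TimeDistributed

masked_model = Sequential()
masked_model.add(Embedding(vocab_size, 128, mask_zero=True))  # index 0 (padding) is masked out
masked_model.add(Bidirectional(LSTM(256, return_sequences=True)))
masked_model.add(TimeDistributed(Dense(tag_size, activation='softmax')))
masked_model.compile(loss='categorical_crossentropy', optimizer=Adam(0.001), metrics=['accuracy'])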
model.fit(X_train_padded, y_train_onehot, batch_size=128, epochs=10,
validation_data=(X_test_padded, y_test_onehot))
scores = model.evaluate(X_test_padded, y_test_onehot, verbose=0)
print(f"{model.metrics_names[1]}: {scores[1] * 100}")
This prints a token accuracy of about 98%.
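That ~98% is inflated: with MAX_LENGTH = 60 and mostly short sentences, the majority of positions are padding (label 0), which the model learns to predict trivially. Measuring token accuracy only on real positions gives a more honest number (a sketch using the arrays built above):

y_pred_test = model.predict(X_test_padded).argmax(axis=-1)
mask = y_test_padded != 0                       # True only at real (non-padded) tokens
real_acc = (y_pred_test[mask] == y_test_padded[mask]).mean()
print(f'Token accuracy excluding padding: {real_acc:.4f}')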
test_sample = ["EU gave German call to take British people"]
test_sample_tokenized = entity_tokenizer.texts_to_sequences(test_sample)
test_sample_padded = pad_sequences(test_sample_tokenized, maxlen=MAX_LENGTH, padding='post')
test_sample_padded
index2word = entity_tokenizer.index_word
index2tag = tag_tokenizer.index_word
y_predicted = model.predict(test_sample_padded)
y_pred = y_predicted.argmax(axis=-1)
y_pred
test_sample_tokenized
for i in range(len(test_sample_tokenized)):
    for word, tag in zip([index2word.get(x, '?') for x in test_sample_tokenized[i]],
                         [index2tag.get(y, '?') for y in y_pred[i]]):
        if word != '<OOV>' and word != '?' and tag != '?':
            print(f'{word} : {tag.upper()}')