Named Entity Recognition with a Bidirectional LSTM

AI/NLP - Natural Language Processing

밀크쌀과자 · 2024. 7. 13. 09:34

This post builds a simple named-entity recognizer in Keras: parse the CoNLL-2003 English training data, tokenize the words and NER tags, pad everything to a fixed length, and train a bidirectional LSTM that predicts a tag for every token.
import re
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import numpy as np

file_path = tf.keras.utils.get_file("train.txt", 
    "https://raw.githubusercontent.com/Franck-Dernoncourt/NeuroNER/master/neuroner/data/conll2003/en/train.txt")
    
for line in open(file_path, 'r').readlines()[:20]:
    print(repr(line))
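
Each line of the CoNLL-2003 file carries four space-separated fields (word, POS tag, chunk tag, NER tag); blank lines separate sentences and '-DOCSTART-' lines mark document boundaries. The first lines printed above look something like:

'-DOCSTART- -X- -X- O\n'
'\n'
'EU NNP B-NP B-ORG\n'
'rejects VBZ B-VP O\n'
'German JJ B-NP B-MISC\n'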
tagged_sentences = []
sentence = []

for line in open(file_path, 'r'):
    # blank lines separate sentences; '-DOCSTART' lines separate documents
    if len(line) == 0 or line.startswith('-DOCSTART') or line[0] == "\n":
        if len(sentence) > 0:
            tagged_sentences.append(sentence)
            sentence = []
        continue

    word, pos_tag, chunk_tag, ner = line.split(' ')  # split the four fields on spaces
    ner = re.sub('\n', '', ner)    # strip the trailing newline, e.g. 'B-MISC\n' -> 'B-MISC'
    word = word.lower()            # lowercase every word before storing
    sentence.append((word, ner))   # keep only the word and its NER tag

# flush the last sentence in case the file does not end with a blank line
if len(sentence) > 0:
    tagged_sentences.append(sentence)
    
print("전체 샘플 개수: ", len(tagged_sentences)) # 전체 샘플의 개수 출력
tagged_sentences[:3]
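
Each parsed sample is a list of (word, tag) pairs; the first sentence should come out roughly as:

[('eu', 'B-ORG'), ('rejects', 'O'), ('german', 'B-MISC'), ('call', 'O'), ('to', 'O'), ('boycott', 'O'), ('british', 'B-MISC'), ('lamb', 'O'), ('.', 'O')]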
inputs, labels = [], []

for pairs in tagged_sentences:
    words, tags = zip(*pairs)
    inputs.append(list(words))
    labels.append(list(tags))

# Let's see how a sequence looks
print(inputs[0])
print(labels[0])

print('Maximum sample length : {}'.format(max([len(w) for w in inputs])))
print('Average sample length : {:.4f}'.format(np.mean([len(w) for w in inputs])))
plt.hist([len(s) for s in inputs], bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()
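
The histogram shows that almost all sentences are well under 60 tokens, so we pad (and truncate) every sequence to a fixed length of 60: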

MAX_LENGTH = 60

# a throwaway tokenizer fitted on the full corpus, just to see how large the vocabulary is
tokenizer = Tokenizer()
tokenizer.fit_on_texts(inputs)

print(len(tokenizer.word_index))   # full vocabulary size, before capping at MAX_WORDS

MAX_WORDS = 4000   # cap the vocabulary at the 4,000 most frequent words; rarer words map to <OOV>

train_sentences, test_sentences, train_tags, test_tags \
                = train_test_split(inputs, labels, test_size=0.2)
len(train_sentences), len(test_sentences), len(train_tags), len(test_tags)
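
Note that train_test_split shuffles randomly on each run; pass a fixed random_state (e.g. random_state=0) if you want a reproducible 80/20 split.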
# fit the word tokenizer on the training sentences only, so the test split stays unseen
entity_tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token='<OOV>')
entity_tokenizer.fit_on_texts(train_sentences)

tag_tokenizer = Tokenizer()
tag_tokenizer.fit_on_texts(labels)

vocab_size = entity_tokenizer.num_words + 1     # MAX_WORDS + 1 (index 0 is reserved for padding)
tag_size = len(tag_tokenizer.word_index) + 1    # number of distinct tags, plus 1 for padding

print('Vocabulary size : {}'.format(vocab_size))
print('Number of NER tags : {}'.format(tag_size))

print(tag_tokenizer.word_index)
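
Because the labels are already lists of tag strings, the Tokenizer only lowercases them rather than re-splitting, so word_index maps the nine CoNLL-2003 tags ('o', 'b-loc', 'b-per', 'b-org', 'i-per', 'i-org', 'b-misc', 'i-loc', 'i-misc') to indices 1-9; with the extra padding index, tag_size comes out to 10.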
X_train = entity_tokenizer.texts_to_sequences(train_sentences)
y_train = tag_tokenizer.texts_to_sequences(train_tags)

X_test = entity_tokenizer.texts_to_sequences(test_sentences)
y_test = tag_tokenizer.texts_to_sequences(test_tags)

len(X_train), len(y_train), len(X_test), len(y_test)

X_train[0], y_train[0]

# pad (and, for sentences longer than MAX_LENGTH, truncate) every sequence to a fixed length
X_train_padded = pad_sequences(X_train, maxlen=MAX_LENGTH, padding='post')
X_test_padded = pad_sequences(X_test, maxlen=MAX_LENGTH, padding='post')
y_train_padded = pad_sequences(y_train, maxlen=MAX_LENGTH, padding='post')
y_test_padded = pad_sequences(y_test, maxlen=MAX_LENGTH, padding='post')
 
print(X_train_padded[0])
print(X_test_padded[0])
print(y_train_padded[0])
print(y_test_padded[0])

# one-hot encode the padded tag indices for categorical_crossentropy
y_train_onehot = to_categorical(y_train_padded, tag_size)
y_test_onehot = to_categorical(y_test_padded, tag_size)
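
As a quick sanity check, the padded inputs should have shape (num_samples, MAX_LENGTH) and the one-hot labels (num_samples, MAX_LENGTH, tag_size):

print(X_train_padded.shape, y_train_onehot.shape)   # e.g. (n, 60) and (n, 60, 10)
print(X_test_padded.shape, y_test_onehot.shape)
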
model = Sequential()
model.add(Embedding(vocab_size, 128))                       # 128-dimensional word embeddings
model.add(Bidirectional(LSTM(256, return_sequences=True)))  # return_sequences=True: one output per token
model.add(Dense(tag_size, activation='softmax'))            # per-token distribution over NER tags
 
model.compile(loss='categorical_crossentropy', optimizer=Adam(0.001), metrics=['accuracy'])
 
model.summary()
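
The summary can be checked by hand: the embedding layer holds vocab_size × 128 = 4001 × 128 = 512,128 weights; each LSTM direction holds 4 × 256 × (128 + 256 + 1) = 394,240, so the bidirectional wrapper has 788,480; and with tag_size = 10 the dense layer adds (2 × 256 + 1) × 10 = 5,130, for roughly 1.3M trainable parameters in total.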

model.fit(X_train_padded, y_train_onehot, batch_size=128, epochs=10,
          validation_data=(X_test_padded, y_test_onehot))

scores = model.evaluate(X_test_padded, y_test_onehot, verbose=0)
print(f"{model.metrics_names[1]}: {scores[1] * 100}")

The test accuracy comes out at around 98%.
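
That number is flattering, though: padding positions dominate the 60-token sequences and are trivially easy to classify. A minimal sketch for measuring accuracy over real tokens only, using the arrays defined above:

y_test_pred = model.predict(X_test_padded).argmax(axis=-1)
mask = X_test_padded != 0      # index 0 only ever comes from padding
real_acc = (y_test_pred[mask] == y_test_padded[mask]).mean()
print(f"token accuracy excluding padding: {real_acc:.4f}")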

test_sample = ["EU gave German call to take British people"]

test_sample_tokenized = entity_tokenizer.texts_to_sequences(test_sample)
test_sample_padded = pad_sequences(test_sample_tokenized, maxlen=MAX_LENGTH, padding='post')
test_sample_padded

index2word = entity_tokenizer.index_word
index2tag = tag_tokenizer.index_word

y_predicted = model.predict(test_sample_padded)
y_pred = y_predicted.argmax(axis=-1)
y_pred

test_sample_tokenized

for i in range(len(test_sample_tokenized)):
    for word, tag in zip([index2word.get(x, '?') for x in test_sample_tokenized[i]],
                         [index2tag.get(y, '?') for y in y_pred[i]]):
        # skip OOV words and anything that fails the index lookup
        if word != '<OOV>' and word != '?' and tag != '?':
            print(f'{word} :  {tag.upper()}')
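
The same steps can be wrapped into a small helper for tagging arbitrary sentences; a sketch built from the tokenizers and model above (the name tag_sentence is illustrative):

def tag_sentence(text):
    # tokenize, pad, and predict a tag index for every position
    seq = entity_tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=MAX_LENGTH, padding='post')
    pred = model.predict(padded).argmax(axis=-1)[0]
    # pair each input token with its predicted tag; zip drops the padding positions
    return [(index2word.get(w, '?'), index2tag.get(t, '?').upper())
            for w, t in zip(seq[0], pred)]

print(tag_sentence("EU gave German call to take British people"))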