Notice
Recent Posts
Recent Comments
Link
«   2026/02   »
1 2 3 4 5 6 7
8 9 10 11 12 13 14
15 16 17 18 19 20 21
22 23 24 25 26 27 28
Tags
more
Archives
Today
Total
관리 메뉴

Silver bullet

영화평 Text 분류 - Sentiment Analysis / IMDB_movie_review & 네이버 영화평 감성 분류 본문

AI/NLP - 자연어처리

영화평 Text 분류 - Sentiment Analysis / IMDB_movie_review & 네이버 영화평 감성 분류

밀크쌀과자 2024. 7. 13. 09:25

IMDB_movie_review  영화평 Text 분류 - Sentiment Analysis

IMDB (Internet Movie Database, https://www.imdb.com/) Dataset
각 25,000 개의 training/testing set 으로 구성된 IMDB 영화관람평
“imdb_reviews” – encoding 되어있지 않은 string 형태의 data
label : positive, negative binary classification
 import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt
# Download the IMDB review dataset via TensorFlow Datasets.
# as_supervised=True yields (text, label) pairs; `info` holds metadata.
dataset, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)

# 25,000 training and 25,000 test examples.
train_dataset, test_dataset = dataset['train'], dataset['test']

len(train_dataset), len(test_dataset)
# Materialize the tf.data pipelines into plain Python lists:
# review strings (via str() of the numpy bytes) and integer labels.
train_sentences = []
train_labels = []
test_sentences = []
test_labels = []

for review, sentiment in train_dataset:
    train_sentences.append(str(review.numpy()))
    train_labels.append(sentiment.numpy())

for review, sentiment in test_dataset:
    test_sentences.append(str(review.numpy()))
    test_labels.append(sentiment.numpy())

# Spot-check the last example of each split.
print(train_labels[-1])
print(train_sentences[-1])

print(test_labels[-1])
print(test_sentences[-1])
# Keras expects label arrays, not Python lists.
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)

print(train_labels.shape)
print(test_labels.shape)
# Keep only the 10,000 most frequent words; the rest map to <OOV>.
vocab_size = 10000

tokenizer = Tokenizer(num_words = vocab_size, oov_token='<OOV>')
# Fit the word index on the training split only (no test leakage).
tokenizer.fit_on_texts(train_sentences)

# Convert each review into a sequence of integer word indices.
train_sequences = tokenizer.texts_to_sequences(train_sentences)
test_sequences = tokenizer.texts_to_sequences(test_sentences)

print(train_sequences[0])
print(test_sequences[0])

# Inspect the length distribution to choose a padding length.
plt.hist([len(s) for s in train_sequences] + [len(s) for s in test_sequences], bins=50);
max_length = 150

# Pad/truncate every sequence to exactly max_length tokens (both at the end).
train_padded = pad_sequences(train_sequences,maxlen=max_length, truncating='post', padding='post')

test_padded = pad_sequences(test_sequences,maxlen=max_length, truncating='post', padding='post')

print(train_padded.shape)
print(test_padded.shape)
print(train_padded[0])
print(test_padded[0])

- Reverse conversion: sequence data back into sentences

# Inverse vocabulary (index -> word) for decoding integer sequences.
reverse_word_index = dict([(value, key) for (key, value) in tokenizer.word_index.items()])

def decode_review(sequence):
    """Render a sequence of word indices as a space-joined sentence.

    Index 0 is the padding value and has no vocabulary entry, so any
    missing index renders as '<pad>'.
    """
    # Fix: actually use the reverse map built above (it was previously
    # dead code — the function went through tokenizer.index_word, which
    # is the same mapping, leaving reverse_word_index unused).
    return ' '.join([reverse_word_index.get(i, '<pad>') for i in sequence])

print(decode_review(train_padded[0]))
print()
print(train_sentences[0])

Model definition

# Bidirectional-LSTM sentiment classifier:
# Embedding(vocab+1 -> 64) -> BiLSTM(64) -> Dense(64, relu) -> Dense(1, sigmoid).
model = Sequential()
model.add(Embedding(vocab_size+1, 64))
model.add(Bidirectional(tf.keras.layers.LSTM(64)))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])
model.summary()

%%time
# NOTE: %%time is a Jupyter cell magic (this file is a notebook export,
# not a plain .py script). Train 30 epochs, using the test split as
# validation data.
num_epochs = 30
history = model.fit(train_padded, train_labels, epochs=num_epochs, batch_size=128,
                validation_data=(test_padded, test_labels), verbose=1)
# Plot training curves: accuracy (left) and loss (right) per epoch.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

ax1.plot(history.history['accuracy'])
ax1.plot(history.history['val_accuracy'])
ax1.set_xlabel('Epochs')
ax1.set_ylabel('accuracy')
ax1.legend(['accuracy', 'val_accuracy'])  # fixed typo: was 'accuarcy'

ax2.plot(history.history['loss'])
ax2.plot(history.history['val_loss'])
ax2.set_xlabel('Epochs')
ax2.set_ylabel('loss')
ax2.legend(['loss', 'val_loss'])
plt.show()

네이버 영화평 감성 분류

!pip install -q KoNLPy
import numpy as np
import pandas as pd
import re
import time
import matplotlib.pyplot as plt

from konlpy.tag import Okt
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, Dense, LSTM

# Download the Naver movie-review (NSMC) train/test TSV files;
# get_file caches the download and returns the local path.
DATA_TRAIN_PATH = tf.keras.utils.get_file("ratings_train.txt", 
                        "https://github.com/ironmanciti/NLP_lecture/raw/master/data/naver_movie/ratings_train.txt")
DATA_TEST_PATH = tf.keras.utils.get_file("ratings_test.txt", 
                        "https://github.com/ironmanciti/NLP_lecture/raw/master/data/naver_movie/ratings_test.txt")
                        
# Load the tab-separated review files (columns: id, document, label).
train_data = pd.read_csv(DATA_TRAIN_PATH, delimiter='\t')
print(train_data.shape)
train_data.head()

test_data = pd.read_csv(DATA_TEST_PATH, delimiter='\t')
print(test_data.shape)
test_data.head()

# Subsample to keep training time manageable (optional).
train_data = train_data.sample(n=50000, random_state=1)
test_data = test_data.sample(n=5000, random_state=1)

print(train_data.shape)
print(test_data.shape)

null value 제거

# Drop rows with missing review text or label, in place.
train_data.dropna(inplace=True)

test_data.dropna(inplace=True)

# Confirm no nulls remain in either split.
train_data.isnull().sum(), test_data.isnull().sum()
# KoNLPy's Okt morphological analyzer; stem=True normalizes word forms.
okt = Okt()
test = "아버지가방에들어가신다"
okt.morphs(test, stem=True)

text 전처리 - 한글 문자가 아닌 것 모두 제거

def preprocessing(sentence, remove_stopwords=True):
    """Normalize one Korean review into a list of morpheme tokens.

    Strips literal "\\n" escape sequences and every non-Hangul character,
    then splits the text into stemmed morphemes with Okt. Stopword
    filtering is a no-op while `stop_words` is empty; uncomment the
    candidate list below to enable it.
    """
    # Candidate Korean stopwords (particles etc.); currently disabled.
    #stop_words = set(['에', '은', '는', '이', '가', '그리고', '것', '들', '수', '등', '로', '을', '를', '만', '도', '아', '의', '그', '다'])
    stop_words = set()                                     # set: O(1) membership tests

    sentence = re.sub(r'\\n', ' ', sentence)               # remove literal backslash-n escapes
    sentence = re.sub('[^가-힣ㄱ-ㅎㅏ-ㅣ ]', '', sentence)  # keep Hangul and spaces only
    sentence = okt.morphs(sentence, stem=True)             # stemmed morpheme tokens
    if remove_stopwords:
        sentence = [token for token in sentence if token not in stop_words]
    return sentence
    
    
%%time
# Tokenize every train/test review with `preprocessing`; reviews that
# become empty after cleaning are skipped (labels stay aligned because
# they are only appended together with their sentence).
train_sentences = []
train_labels = []
test_sentences = []
test_labels = []

start = time.time()

for i, (sent, label) in enumerate(zip(train_data['document'], train_data['label'])):
    if i % 10000 == 0:
        print(f"Train processed = {i}")
    sent = preprocessing(sent)
    if len(sent) > 0:
        train_sentences.append(sent)
        train_labels.append(label)

for i, (sent, label) in enumerate(zip(test_data['document'], test_data['label'])):
    if i % 1000 == 0:
        print(f"Test processed = {i}")
    sent = preprocessing(sent)
    if len(sent) > 0:
        test_sentences.append(sent)
        test_labels.append(label)
    
print(time.time() - start)
# Labels as numpy arrays for Keras.
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)

print(train_labels.shape)
print(test_labels.shape)
# Vocabulary cap for the Korean morpheme tokenizer.
VOCAB_SIZE = 20000

tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
# Fit on the already-tokenized training morpheme lists only.
tokenizer.fit_on_texts(train_sentences)

train_sequences = tokenizer.texts_to_sequences(train_sentences)
test_sequences = tokenizer.texts_to_sequences(test_sentences)

print(train_sequences[0])
print(test_sequences[0])

# Length histogram to pick the padding length; reviews are short.
plt.hist([len(s) for s in train_sequences] + [len(s) for s in test_sequences], bins=30);
max_length = 15

train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post', truncating='post')

test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post', truncating='post')

print(train_padded.shape)
print(test_padded.shape)
print(train_padded[0])
print(test_padded[0])
# Inverse vocabulary (index -> morpheme) for decoding padded sequences.
reverse_word_index = {idx: word for word, idx in tokenizer.word_index.items()}

def decode_sentence(sequence):
    """Render a sequence of word indices as a space-joined string; unknown indices (e.g. padding 0) become '?'."""
    return ' '.join(reverse_word_index.get(token_id, '?') for token_id in sequence)

print(decode_sentence(train_padded[4]))
print()
print(train_sentences[4])
# BiLSTM sentiment classifier for the Korean reviews:
# Embedding(VOCAB+1 -> 64) -> BiLSTM(64) -> Dense(32, relu) -> Dense(1, sigmoid).
model = Sequential()
model.add(Embedding(VOCAB_SIZE+1, 64))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
# Train 30 epochs, validating on the held-out test split.
num_epochs = 30
history = model.fit(train_padded, train_labels, epochs=num_epochs, batch_size=128, 
                    validation_data=(test_padded, test_labels), verbose=1)
# Plot training curves: accuracy (left) and loss (right) per epoch.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

ax1.plot(history.history['accuracy'])
ax1.plot(history.history['val_accuracy'])
ax1.set_xlabel('Epochs')
ax1.set_ylabel('accuracy')
ax1.legend(['accuracy', 'val_accuracy'])  # fixed typo: was 'accuarcy'

ax2.plot(history.history['loss'])
ax2.plot(history.history['val_loss'])
ax2.set_xlabel('Epochs')
ax2.set_ylabel('loss')
ax2.legend(['loss', 'val_loss'])
plt.show()
# Sanity-check the model on a hand-written review.
sample_text = ['이 영화는 정말 짜증나서 못 보겠다']
# sample_text = ['오랜만에 접한 수작']

# BUG FIX: the tokenizer was fitted on Okt morpheme lists, so the raw
# sentence must go through the same `preprocessing` step before lookup —
# otherwise whole-word tokens all map to <OOV> and the prediction is
# meaningless.
sample_tokens = [preprocessing(text) for text in sample_text]
sample_seq = tokenizer.texts_to_sequences(sample_tokens)
sample_padded = pad_sequences(sample_seq, maxlen=max_length, padding='post')
sample_padded
# Extract the scalar probability instead of comparing a whole array to 0.5.
score = model.predict(sample_padded)[0][0]
score

['positive' if score >= 0.5 else 'negative']

'AI > NLP - 자연어처리' 카테고리의 다른 글

양방향 LSTM을 이용한 개체명 인식  (0) 2024.07.13