Hands-on case study: Weibo sentiment analysis
Data: each text file contains the samples for one class.
Labels: 0 = joy; 1 = anger; 2 = disgust; 3 = sadness
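For reference, the assumed layout under ./dataset (matching the filenames used in main.py below) is one UTF-8 text file per class, one Weibo post per line:

dataset/
├── 0_simplifyweibo.txt   # joy
├── 1_simplifyweibo.txt   # anger
├── 2_simplifyweibo.txt   # disgust
└── 3_simplifyweibo.txt   # sadness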
Steps
- Read the text files
- Split into training and test sets
- Extract features
- Train the model and predict (a compact sklearn sketch follows this list)
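Before walking through the full implementation, here is a minimal end-to-end sketch of the same pipeline built on scikit-learn's own utilities (TfidfVectorizer, train_test_split, MultinomialNB). The text/label column names match the DataFrames built below; everything else is illustrative, not the implementation used in this case study:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

def quick_baseline(df: pd.DataFrame) -> float:
    """tf-idf + Naive Bayes baseline for a DataFrame with 'text' and 'label' columns."""
    X_train, X_test, y_train, y_test = train_test_split(
        df['text'], df['label'], test_size=0.2, stratify=df['label'], random_state=42)
    # Split on whitespace so single-character Chinese words are kept;
    # max_features mirrors the 200 most common words used below.
    vec = TfidfVectorizer(max_features=200, token_pattern=r'(?u)\S+')
    X_train_feat = vec.fit_transform(X_train)   # fit the vocabulary on training text only
    X_test_feat = vec.transform(X_test)
    clf = MultinomialNB().fit(X_train_feat, y_train)
    return clf.score(X_test_feat, y_test)       # mean accuracy on the held-out split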
Code:
tools.py
import re
import jieba.posseg as pseg
import pandas as pd
import math
import numpy as np

# Load the stopword list once; a set makes membership tests O(1).
stopwords = set(line.rstrip() for line in open('./中文停用词库.txt', 'r', encoding='utf-8'))

def proc_text(raw_line):
    """Clean one line of raw text and return its segmented words, space-joined."""
    # Keep Chinese characters only (strip punctuation, digits, Latin letters, ...).
    filter_pattern = re.compile('[^\u4E00-\u9FD5]+')
    chinese_only = filter_pattern.sub('', raw_line)
    # Segment with jieba's POS tagger; the POS flag itself is not used here.
    words_lst = pseg.cut(chinese_only)
    meaningful_words = [word for word, flag in words_lst if word not in stopwords]
    return ' '.join(meaningful_words)

def split_train_test(text_df, size=0.8):
    """Split the data into training and test sets per class, preserving the class ratios."""
    train_parts, test_parts = [], []
    labels = [0, 1, 2, 3]
    for label in labels:
        # Take the first `size` fraction of each class for training, the rest for testing.
        text_df_w_label = text_df[text_df['label'] == label].reset_index(drop=True)
        n_lines = text_df_w_label.shape[0]
        split_line_no = math.floor(n_lines * size)
        train_parts.append(text_df_w_label.iloc[:split_line_no, :])
        test_parts.append(text_df_w_label.iloc[split_line_no:, :])
    # DataFrame.append was removed in pandas 2.0; concatenate the per-class parts instead.
    train_text_df = pd.concat(train_parts).reset_index(drop=True)
    test_text_df = pd.concat(test_parts).reset_index(drop=True)
    return train_text_df, test_text_df

def get_word_list_from_data(text_df):
    """Collect every word in the dataset into one flat list."""
    word_list = []
    for _, r_data in text_df.iterrows():
        word_list += r_data['text'].split(' ')
    return word_list

def extract_feat_from_data(text_df, text_collection, common_words_freqs):
    """Build tf-idf feature vectors over the most common words."""
    n_sample = text_df.shape[0]
    n_feat = len(common_words_freqs)
    common_words = [word for word, _ in common_words_freqs]
    X = np.zeros([n_sample, n_feat])
    y = np.zeros(n_sample)
    print('Extracting features...')
    for i, r_data in text_df.iterrows():
        if (i + 1) % 5000 == 0:
            print('Features extracted for {} samples'.format(i + 1))
        # Compare whole tokens, not substrings: '高' must not match inside '高兴'.
        words = r_data['text'].split(' ')
        feat_vec = []
        for word in common_words:
            if word in words:
                tf_idf_val = text_collection.tf_idf(word, words)
            else:
                tf_idf_val = 0
            feat_vec.append(tf_idf_val)
        X[i, :] = np.array(feat_vec)
        y[i] = int(r_data['label'])
    return X, y

def cal_acc(true_labels, pred_labels):
    """Compute classification accuracy."""
    n_total = len(true_labels)
    correct_list = [true_labels[i] == pred_labels[i] for i in range(n_total)]
    return sum(correct_list) / n_total
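A quick sanity check of the helpers above (the sample sentence is illustrative; the exact segmentation depends on jieba's dictionary and on the stopword list):

from tools import proc_text, cal_acc

# Non-Chinese characters are stripped first, then the rest is segmented.
print(proc_text('今天天气真好,我很开心!2023'))
# e.g. '今天 天气 真好 开心' (exact tokens vary with jieba / stopwords)

print(cal_acc([0, 1, 2, 3], [0, 1, 2, 2]))  # -> 0.75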
main.py
import os
import pandas as pd
import nltk
from tools import proc_text, split_train_test, get_word_list_from_data, \
    extract_feat_from_data, cal_acc
from nltk.text import TextCollection
from sklearn.naive_bayes import GaussianNB

dataset_path = './dataset'
text_filenames = ['0_simplifyweibo.txt', '1_simplifyweibo.txt',
                  '2_simplifyweibo.txt', '3_simplifyweibo.txt']
output_text_filename = 'raw_weibo_text.csv'
output_cln_text_filename = 'clean_weibo_text.csv'
# Set to False after the first run to skip re-cleaning the raw text.
is_first_run = True

def read_and_save_to_csv():
    """Read the raw text files and save the labels and text to a single csv."""
    text_w_label_df_lst = []
    for text_filename in text_filenames:
        text_file = os.path.join(dataset_path, text_filename)
        # The leading character of the filename encodes the class label.
        label = int(text_filename[0])
        with open(text_file, 'r', encoding='utf-8') as f:
            lines = f.read().splitlines()
        labels = [label] * len(lines)
        text_series = pd.Series(lines)
        label_series = pd.Series(labels)
        text_w_label_df = pd.concat([label_series, text_series], axis=1)
        text_w_label_df_lst.append(text_w_label_df)
    result_df = pd.concat(text_w_label_df_lst, axis=0)
    result_df.columns = ['label', 'text']
    result_df.to_csv(os.path.join(dataset_path, output_text_filename),
                     index=False, encoding='utf-8')

def run_main():
    """Main entry point."""
    if is_first_run:
        print('Cleaning the raw text data...', end=' ')
        read_and_save_to_csv()
        text_df = pd.read_csv(os.path.join(dataset_path, output_text_filename),
                              encoding='utf-8')
        # Clean each line, then drop samples that end up empty.
        text_df['text'] = text_df['text'].apply(proc_text)
        text_df = text_df[text_df['text'] != '']
        text_df.to_csv(os.path.join(dataset_path, output_cln_text_filename),
                       index=False, encoding='utf-8')
        print('done; results saved.')
    print('Loading the cleaned text data')
    clean_text_df = pd.read_csv(os.path.join(dataset_path, output_cln_text_filename),
                                encoding='utf-8')
    train_text_df, test_text_df = split_train_test(clean_text_df)
    print('Samples per class in the training set:', train_text_df.groupby('label').size())
    print('Samples per class in the test set:', test_text_df.groupby('label').size())
    n_common_words = 200
    print('Counting word frequencies...')
    all_words_in_train = get_word_list_from_data(train_text_df)
    fdist = nltk.FreqDist(all_words_in_train)
    common_words_freqs = fdist.most_common(n_common_words)
    print('The {} most frequent words are:'.format(n_common_words))
    for word, count in common_words_freqs:
        print('{}: {} times'.format(word, count))
    print()
    # Build the collection from token lists so tf-idf counts whole words, not characters.
    text_collection = TextCollection([text.split(' ')
                                      for text in train_text_df['text'].values])
    print('Extracting features for the training samples...', end=' ')
    train_X, train_y = extract_feat_from_data(train_text_df, text_collection, common_words_freqs)
    print('done')
    print()
    print('Extracting features for the test samples...', end=' ')
    test_X, test_y = extract_feat_from_data(test_text_df, text_collection, common_words_freqs)
    print('done')
    print('Training the model...', end=' ')
    gnb = GaussianNB()
    gnb.fit(train_X, train_y)
    print('done')
    print()
    print('Testing the model...', end=' ')
    test_pred = gnb.predict(test_X)
    print('done')
    print('Accuracy:', cal_acc(test_y, test_pred))

if __name__ == '__main__':
    run_main()
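GaussianNB assumes normally distributed features, which is a rough fit for sparse, non-negative tf-idf vectors; MultinomialNB is usually the more natural choice for this kind of feature. A hedged drop-in variant for the training/testing block above, with a per-class breakdown from sklearn.metrics:

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

mnb = MultinomialNB()
mnb.fit(train_X, train_y)                 # same tf-idf features as above
test_pred = mnb.predict(test_X)
print('Accuracy:', accuracy_score(test_y, test_pred))
print(classification_report(test_y, test_pred,
                            target_names=['joy', 'anger', 'disgust', 'sadness']))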