'''
-*- coding: utf-8 -*-
统计生产数据中的话术中各敏感词出现的频率。
'''
import pandas as pd
from functools import cmp_to_key
from datetime import datetime
from tqdm import tqdm
# Remember when the script started so the elapsed time can be printed at the end.
starttime = datetime.now()

# Locations of the two input workbooks and of the result workbook.
infile1 = "./input/input1/" + "生产数据.xlsx"
infile2 = "./input/input1/" + "敏感词.xlsx"
outfile = "./output/output1/" + "统计结果.xlsx"

# df1: rows that are later grouped and scanned for sensitive words.
df1 = pd.read_excel(infile1)

# df2: sensitive-word table, indexed by its 'user_say' column so the words of
# one entry can be fetched with df2.loc[...].
df2 = pd.read_excel(infile2)
df2 = df2.set_index('user_say')

print("\n数据读取完成")
def compare(a, b):
    """Three-way comparison of two sized objects by their length.

    Returns 1 when ``a`` is longer than ``b``, -1 when it is shorter, and 0
    when both have the same length, which is the contract expected by
    ``functools.cmp_to_key``.
    """
    len_a, len_b = len(a), len(b)
    # Subtracting the two booleans yields the classic cmp() result: 1, 0 or -1.
    return (len_a > len_b) - (len_a < len_b)
# Count, for every group of df1 (grouped by the '匹配扩展问' column), how many
# rows of '会话内容' contain each sensitive word listed for that group in df2.
#
# Each output row has the layout:
#     [group key, word_1, count_1, word_2, count_2, ...]
# where count_k is the number of rows containing word_k (not the number of
# occurrences), stored as a string.
grouped = df1.groupby('匹配扩展问')

# Collect plain Python rows and build the DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the frame on
# every call.
result_rows = []
for user_say2, data in tqdm(grouped):
    # Raises KeyError if the group key has no entry in the sensitive-word table.
    sensitive_words = df2.loc[user_say2].dropna().tolist()

    # Longest words first, so that a short word which is a substring of a
    # longer one is only counted where it appears on its own. The sort is
    # stable, so equally long words keep their original order.
    sensitive_words.sort(key=len, reverse=True)

    # Work on a private list of the texts. The original wrote back through
    # data.iloc[i][...] (chained assignment), which modifies a temporary row
    # and does not reliably update `data`, so matched words were never
    # actually removed. Non-string cells (e.g. NaN from empty Excel cells)
    # cannot contain a word and are skipped instead of raising TypeError.
    texts = [text for text in data['会话内容'].tolist() if isinstance(text, str)]

    new_row = [user_say2]
    for word in sensitive_words:
        count = 0
        for i, text in enumerate(texts):
            if word in text:
                count += 1
                # Strip the matched word so shorter words processed later do
                # not match inside it again.
                texts[i] = text.replace(word, '')
        new_row.append(word)
        new_row.append(str(count))
    result_rows.append(new_row)
print("\n敏感词统计完成")

# Rows may have different lengths; pandas pads the shorter ones with missing
# values and numbers rows/columns from 0.
df_out = pd.DataFrame(result_rows)
df_out.to_excel(outfile)
print("\n用时:", datetime.now() - starttime)