# 敏感词查找
'''
-- coding: utf-8 --
从input2中的入库数据中查找敏感词,输出包含敏感词的话术和对应的敏感词。
'''
import pandas as pd
from functools import cmp_to_key
from datetime import datetime
from tqdm import tqdm
import os
# Wall-clock start; the script prints the elapsed time at the very end.
starttime = datetime.now()

# Directory that is scanned for the data workbooks to check.
infile1 = "./input/input2/"
# Workbook holding the sensitive-word list; it lives in that same directory.
infile2 = f"{infile1}敏感词.xlsx"
# Workbook the search results are written to.
outfile = "./output/output2/查找结果.xlsx"

# Load the word list up front so that a missing file fails before any scan.
df2 = pd.read_excel(infile2)
print("\n敏感词读取完成")
def compare(a, b):
    """Three-way comparison of two sized objects by their length.

    Returns 1 when ``a`` is longer than ``b``, -1 when it is shorter and 0
    when both have the same length, which is the contract expected by
    ``functools.cmp_to_key``.
    """
    len_a, len_b = len(a), len(b)
    # Subtracting the two booleans yields exactly 1, -1 or 0 as an int.
    return (len_a > len_b) - (len_a < len_b)
def _find_sensitive_words(text, words):
    """Return the entries of ``words`` that occur in ``text``, in list order.

    ``words`` is expected to be ordered longest first. Every hit is removed
    from the working copy of the text before the next word is tried, so a
    short word that only occurs inside an already matched longer word is not
    reported a second time.
    """
    found = []
    for word in words:
        if word in text:
            found.append(word)
            text = text.replace(word, '')
    return found


# Clean the word list before use: blank spreadsheet cells are read as NaN and
# numeric cells as numbers, both of which would break len() and the substring
# test, while an empty string would "match" every single row.
sensitive_words = [str(w) for w in df2['敏感词'].dropna()]
sensitive_words = [w for w in sensitive_words if w]
# Longest first, so that longer words win over the shorter words they contain.
# list.sort is stable, so words of equal length keep their spreadsheet order.
sensitive_words.sort(key=len, reverse=True)

# Matching rows are collected in a plain list and turned into a DataFrame
# once at the end: DataFrame.append was removed in pandas 2.0 and copied the
# whole frame on every call.
matched_rows = []

# os.listdir returns names in arbitrary order; sort for a reproducible report.
infiles = sorted(
    infile1 + i for i in os.listdir(infile1) if i.endswith('入库数据.xlsx')
)
for infile in infiles:
    df1 = pd.read_excel(infile)
    basename = os.path.basename(infile)
    print("处理" + basename)
    # The scene name is the part of the file name before the first underscore.
    scene = basename.split('_')[0]
    # Iterate the two needed columns directly instead of df1.iloc[i] lookups.
    rows = zip(df1['user_say'], df1['intent'])
    for user_say, intent in tqdm(rows, total=len(df1)):
        # A blank cell would otherwise be searched as the literal text 'nan'.
        if pd.isna(user_say):
            continue
        word_contained = _find_sensitive_words(str(user_say), sensitive_words)
        if word_contained:
            matched_rows.append(
                [user_say, intent, ', '.join(word_contained), scene]
            )
print("\n敏感词查找完成")

# Passing the column names to the constructor also works when nothing matched;
# assigning .columns on an empty frame raises a length-mismatch ValueError.
df_out = pd.DataFrame(
    matched_rows,
    columns=['user_say', 'intent', 'sensitive_words', 'scene'],
)

# Make sure the destination directory exists before writing the report.
out_dir = os.path.dirname(outfile)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
df_out.to_excel(outfile)
print("\n用时:", datetime.now() - starttime)