import pandas as pd
import time
# Reference timestamp for the "Time used" report printed at the end of the run.
# time.clock() was removed in Python 3.8. It is used only where it still
# exists, so that both ends of the measurement read the same clock on old
# interpreters; everywhere else time.perf_counter() is used.
start = getattr(time, "clock", time.perf_counter)()
def getTestData(n, path, topath):
    """Write a per-intent random sample of an Excel sheet to a new workbook.

    The sheet at ``path`` is split by the value of its ``intent`` column.
    Every intent with more than ``n`` rows is cut down to ``n`` randomly
    chosen, distinct rows; intents with ``n`` rows or fewer are kept whole.
    The pieces are written to ``topath`` in order of each intent's first
    appearance in the input.

    Args:
        n: Maximum number of rows to keep per intent.
        path: Excel file to read. It must contain an ``intent`` column.
        topath: Excel file to write. The DataFrame index is not written.

    Raises:
        KeyError: If the sheet has no ``intent`` column.
    """
    data = pd.read_excel(path)

    pieces = []
    # sort=False keeps the intents in order of first appearance instead of
    # sorting them alphabetically.
    for _, rows in data.groupby('intent', sort=False):
        if len(rows) > n:
            # The group is larger than n, so sampling without replacement is
            # always possible and keeps duplicate rows out of the test set.
            rows = rows.sample(n=n)
        pieces.append(rows)

    if pieces:
        # Concatenate once; growing a DataFrame inside the loop re-copies
        # everything collected so far on each iteration.
        testdata = pd.concat(pieces)
    else:
        # No usable intent at all: still emit a sheet with the input's columns.
        testdata = data.iloc[0:0]

    testdata.to_excel(topath, index=False)
import os
# Directory that holds the intent-library workbooks.
p = r"D:\Users\LIUQIYUAN125\Desktop\新建文件夹"
# Keep regular files only: os.listdir() also returns sub-directories, and a
# directory name passed on to pd.read_excel would abort the run.
wendang = [name for name in os.listdir(p) if os.path.isfile(os.path.join(p, name))]
# The entries of wendang are bare file names, so make p the working directory
# to let them be opened without a directory prefix.
os.chdir(p)
# Directory that receives the generated "original" test-set workbooks.
test_path = r"D:\Users\LIUQIYUAN125\Desktop\测试集\test_201906"
if __name__ == "__main__":
    n = 30  # number of utterances to draw for each intent

    # to_excel() does not create missing directories, so make sure the output
    # directory exists before the first workbook is written.
    os.makedirs(test_path, exist_ok=True)

    for i in wendang:
        # The output file name is the input file name with 'intent' swapped
        # for 'original_test'. Depending on how the intent-library files are
        # named, 'intent' may have to be changed to the literal '入库数据'
        # ("ingested data").
        i_topath = i.replace('intent', 'original_test')
        topath = os.path.join(test_path, i_topath)
        getTestData(n, i, topath)

    # time.clock() was removed in Python 3.8. It is used only where it still
    # exists, so that both ends of the measurement read the same clock on old
    # interpreters; everywhere else time.perf_counter() is used.
    elapsed = getattr(time, "clock", time.perf_counter)() - start
    print("Time used:", elapsed)