2019 Big Data Challenge (feature engineering code)
Link: 2019 Big Data Challenge
Some thoughts:
The Big Data Challenge carries real weight. Nearly 5,000 people took part, most of them graduate students, and the event ran through preliminary -> first round -> second round -> final, with the preliminary deciding who entered the formal rounds. The data scale grew at every stage; teams that passed the first round got GPU time on the online platform, but by then the data had grown to around one billion records, so compute was still tight. You have to optimize your code and allocate resources sensibly, avoiding both jobs that get killed from excessive GPU usage and compute that sits idle. The other factor is the participants themselves: this kind of competition really benefits from teamwork, since in such a crowded field more ideas are always better. It also demands a lot of coding skill, clear thinking, fast learning, collaboration, and experience. All in all, the competition taught me a great deal O(∩_∩)O
What follows is only the feature engineering part, mostly utility functions kept so I can recall them next time. There is a lot of code and it is tedious to organize, so this is mainly a personal record; I will tidy it up in more detail when I have time. If you are reading this for reference, please bear with me <(^-^)>
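For context, the snippets below appear to rely on roughly the following imports. This is a best-guess sketch: the aliases (for example `mr` for `sklearn.metrics`) and the globals used as default arguments further down (`model_dm2`, `w2v_model2`, `idf`, `idf_vocab`) are assumptions, since they are built elsewhere in the original project and do not appear in this excerpt.

# Best-guess imports for the snippets below; aliases such as `mr` are assumptions,
# and model_dm2 / w2v_model2 / idf / idf_vocab are assumed to be loaded elsewhere.
import difflib
import distance                      # the `distance` package (jaccard / sorensen)
import numpy as np
import pandas as pd
import xgboost as xgb
import gensim
from gensim.models.doc2vec import Doc2Vec
from gensim.corpora.dictionary import Dictionary
from numpy import zeros, double, sum as np_sum
from pyemd import emd                # used by the WordMoverDistance helper
from scipy.spatial.distance import cosine, euclidean, cityblock
from sklearn import metrics as mr
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, log_loss
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures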
def get_qAUC(qid_set, Y_pred, Y_test):
    """Competition metric: average the per-query AUC; a query whose labels contain
    only one class contributes 0.5."""
    valid1 = pd.DataFrame()
    valid1['query_id'] = qid_set
    valid1["pred_prob"] = Y_pred
    valid1['label'] = Y_test
    vg = valid1.groupby(['query_id'])
    aucs = []
    for i in vg:
        tmpdf = i[1]
        if len(tmpdf['label'].unique()) != 2:
            aucs.append(0.5)
            continue
        fpr, tpr, thresholds = roc_curve(tmpdf['label'], tmpdf['pred_prob'], pos_label=1)
        aucs.append(auc(fpr, tpr))
    qAUC = np.average(aucs)
    return qAUC
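A minimal sanity check, assuming the imports above; the arrays are toy values, not competition data:

# Toy check of get_qAUC: query 1 has both label classes, query 2 has only one.
qid_toy   = np.array([1, 1, 1, 2, 2])
pred_toy  = np.array([0.9, 0.2, 0.6, 0.3, 0.7])
label_toy = np.array([1, 0, 1, 0, 0])
print(get_qAUC(qid_toy, pred_toy, label_toy))  # query 2 contributes the 0.5 fallback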
def selectSampleOne(dta, threshold=0.2):
    """Filter the sample set by a per-query click ratio, i.e. the number of clicked titles
    under a query divided by the total number of titles under that query.
    Input:  dta, a DataFrame; threshold, ratio cutoff (default 0.2).
    Output: the filtered DataFrame.
    Method: compute the group-wise ratio, merge it back, then filter on the condition."""
    lab_mean = dta[['query_id', 'label']].groupby('query_id').mean().rename(columns={'label': 'lab_mean'})
    dta = pd.merge(dta, lab_mean, on='query_id', how='left')
    dta = dta[(dta['lab_mean'] >= threshold)]
    del dta['lab_mean']
    return dta
def selectSampleTwo(dta, threshold=0.2):
    """Same filtering idea as selectSampleOne (per-query click ratio against a threshold),
    implemented through index lookups instead of a merge.
    Input:  dta, a DataFrame; threshold, ratio cutoff (default 0.2).
    Output: the filtered DataFrame."""
    groupbyed = dta[['query_id', 'label']].groupby('query_id').mean()
    index = groupbyed.index
    lab_mean = groupbyed['label'].values
    select = []
    for _ in range(len(lab_mean)):
        if lab_mean[_] > threshold:
            select.append(index[_])
    dta = dta[(np.isin(dta.query_id, np.array(select)))]
    return dta
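A quick usage sketch for the two helpers above on a made-up frame; note they differ slightly at the boundary (>= versus >):

# Toy frame: query 1 has click ratio 1/3, query 2 has click ratio 0.
toy = pd.DataFrame({'query_id': [1, 1, 1, 2, 2],
                    'label':    [1, 0, 0, 0, 0]})
print(selectSampleOne(toy, threshold=0.2))  # keeps query 1 (ratio >= 0.2)
print(selectSampleTwo(toy, threshold=0.2))  # same rows here (ratio > 0.2)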
def XGBoostONELR3(X_train, X_test, Y_train, Y_test, X_train_lr, Y_train_lr):
    """GBDT + LR stack: one-hot encode the XGBoost leaf indices, concatenate them with
    degree-2 polynomial features, and train a logistic regression on top."""
    XGB = xgb.XGBClassifier(nthread=14, learning_rate=0.08, n_estimators=100)
    XGB.fit(X_train, Y_train)
    print('XGB training finished ------>')
    OHE = OneHotEncoder()
    OHE.fit(XGB.apply(X_train))
    print('OneHotEncoder fitting finished ------>')
    ploy = PolynomialFeatures(degree=2).fit(X_train_lr)
    print('PolynomialFeatures fitting finished ------>')
    tran = np.hstack((OHE.transform(XGB.apply(X_train_lr)).toarray(), ploy.transform(X_train_lr)))
    print('hstack finished ------>')
    # The l1 penalty needs the liblinear (or saga) solver; older scikit-learn used liblinear by default.
    LR = LogisticRegression(n_jobs=14, C=0.06, penalty='l1', solver='liblinear')
    LR.fit(tran, Y_train_lr)
    del tran
    print('LR training finished ------>')
    Y_pred = LR.predict_proba(np.hstack((OHE.transform(XGB.apply(X_test)).toarray(), ploy.transform(X_test))))[:, 1]
    print('XGBoost + LogisticRegression: log_loss', log_loss(Y_test, Y_pred))
    return XGB, OHE, LR, ploy
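A hedged usage sketch on synthetic data; make_classification and the split sizes are illustrative, not the competition setup:

# Train the stack on synthetic data: one half of the training split fits XGB,
# the other half fits the LR that sits on top of the leaf / polynomial features.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=2000, n_features=20, random_state=42)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=42)
X_xgb, X_lr, y_xgb, y_lr = train_test_split(X_tr, y_tr, test_size=0.5, random_state=42)
XGB, OHE, LR, ploy = XGBoostONELR3(X_xgb, X_te, y_xgb, y_te, X_lr, y_lr)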
# Query click-through rate (CTR)
def query_ctr(df):
    item = "query"
    # Note: the dict-renaming form of agg below was deprecated in newer pandas versions.
    temp = df.groupby(item, as_index=False)['label'].agg({item + '_click': 'sum', item + '_count': 'count'})
    temp[item + '_ctr'] = temp[item + '_click'] / (temp[item + '_count'])
    df = pd.merge(df, temp, on=item, how='left')
    return df
## Text features
# Word-level length of a query or title
def get_length(row):
    temp = len(row.strip().split(' '))
    return temp
# Whether the query is a subset of the title; also intersection / difference / symmetric-difference sizes
def get_subset(row):
    temp1 = set(row['query'].strip().split(' '))
    temp2 = row['title'].strip().split(' ')
    if temp1.issubset(temp2):
        return 1
    else:
        return 0
def get_inter_length(row):
    temp1 = set(row['query'].strip().split(' '))
    temp2 = set(row['title'].strip().split(' '))
    inter_length = len(temp1 & temp2)  # intersection
    return inter_length
def get_differ_length(row):
    temp1 = set(row['query'].strip().split(' '))
    temp2 = set(row['title'].strip().split(' '))
    differ_length = len(temp2 - temp1)  # difference (title minus query)
    return differ_length
# Whether the title is a subset of the query
def get_subset2(row):
    temp1 = set(row['query'].strip().split(' '))
    temp2 = set(row['title'].strip().split(' '))
    if temp2.issubset(temp1):
        return 1
    else:
        return 0
# Position of the first query word inside the title; -1 if it does not appear
def get_location(row):
    str1 = row['query'].strip().split(' ')[0]
    str2 = row['title'].strip().split(' ')
    if str1 in str2:
        location = str2.index(str1)
    else:
        location = -1
    return location
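A sketch of how these row-wise helpers are attached to the frame; the toy data just mimics the anonymized word-ID format and the feature column names are illustrative:

# Attach the row-wise text features via apply (get_length works on a single string column,
# the others need the whole row, hence axis=1).
toy_df = pd.DataFrame({'query': ['1 2 3', '4 5'],
                       'title': ['1 2 3 7', '6 8 9']})
toy_df['query_length'] = toy_df['query'].apply(get_length)
toy_df['title_length'] = toy_df['title'].apply(get_length)
toy_df['query_in_title'] = toy_df.apply(get_subset, axis=1)
toy_df['inter_length'] = toy_df.apply(get_inter_length, axis=1)
toy_df['first_word_location'] = toy_df.apply(get_location, axis=1)
print(toy_df)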
## Semantic similarity
# Edit (Levenshtein) distance between query and title
def difflib_leven(row):
    str1 = row['query'].strip()
    str2 = row['title'].strip()
    leven_cost = 0
    s = difflib.SequenceMatcher(None, str1, str2)
    for tag, i1, i2, j1, j2 in s.get_opcodes():
        # print('{:7} a[{}: {}] --> b[{}: {}] {} --> {}'.format(tag, i1, i2, j1, j2, str1[i1: i2], str2[j1: j2]))
        if tag == 'replace':
            leven_cost += max(i2 - i1, j2 - j1)
        elif tag == 'insert':
            leven_cost += (j2 - j1)
        elif tag == 'delete':
            leven_cost += (i2 - i1)
    return leven_cost
# Hamming distance between query and title (word level, padded by the length gap)
def hamming_distance(row):
    temp1 = row['query'].strip().split(' ')
    temp2 = row['title'].strip().split(' ')
    if (len(temp1) < len(temp2)):
        temp_max = len(temp2)
        temp_min = len(temp1)
    else:
        temp_max = len(temp1)
        temp_min = len(temp2)
    ham_distance = temp_max - temp_min
    for i in range(temp_min):
        if temp1[i] != temp2[i]:
            ham_distance += 1
    return ham_distance
# Cosine similarity between query and title (bag-of-words count vectors)
def cos_distance(row):
    temp1 = row['query'].strip().split(' ')
    temp2 = row['title'].strip().split(' ')
    union_set = list(set(temp1 + temp2))
    vector1 = np.zeros(len(union_set))
    vector2 = np.zeros(len(union_set))
    # Fill each position of the two vectors in turn
    for i in range(len(union_set)):
        # Count how often each vocabulary word occurs in each sequence
        for k1 in range(len(temp1)):
            if union_set[i] == temp1[k1]:
                vector1[i] += 1
        for k2 in range(len(temp2)):
            if union_set[i] == temp2[k2]:
                vector2[i] += 1
    cos_dist = float(np.dot(vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2)))
    return cos_dist
# Union size
def get_union_length(row):
    temp1 = set(row['query'].strip().split(' '))
    temp2 = set(row['title'].strip().split(' '))
    union_length = len(temp1 | temp2)  # union
    return union_length
# doc2vec similarity
model_dm = Doc2Vec.load('doc2vec.model')
def doc_sims(row, doc2vec_model=model_dm):
    # Query side: mean of word vectors; title side: inferred document vector
    # (the commented alternatives swap the two representations).
    # inferred_vector_dm1 = doc2vec_model.infer_vector(row[1].strip().split(' '))
    inferred_vector_dm1 = np.mean(doc2vec_model[row[1].strip().split(' ')], axis=0)
    inferred_vector_dm2 = doc2vec_model.infer_vector(row[3].strip().split(' '))
    # inferred_vector_dm2 = np.mean(doc2vec_model[row[3].strip().split(' ')], axis=0)
    sims = (np.dot(inferred_vector_dm1, inferred_vector_dm2)) / (np.linalg.norm(inferred_vector_dm1) * np.linalg.norm(inferred_vector_dm2))
    return sims
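For completeness, a hedged sketch of how a Doc2Vec model like the 'doc2vec.model' loaded above could be built; the corpus and hyperparameters here are placeholders, the real model was trained on the competition corpus:

# Train and save a small Doc2Vec model (illustrative corpus in the anonymized word-ID format).
from gensim.models.doc2vec import TaggedDocument
corpus = [TaggedDocument(words=text.split(' '), tags=[i])
          for i, text in enumerate(['1 2 3', '4 5 6', '1 2 7'])]
dm = Doc2Vec(corpus, vector_size=100, min_count=1, epochs=10)
dm.save('doc2vec.model')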
# Difference 2 (query minus title)
def get_differ_length2(row):
    temp1 = set(row['query'].strip().split(' '))
    temp2 = set(row['title'].strip().split(' '))
    differ_length2 = len(temp1 - temp2)  # difference
    return differ_length2
w2v_model = gensim.models.KeyedVectors.load_word2vec_format("word2vec.model")
# How many of the top-10 word2vec neighbours of each title word (and of the title as a whole) appear in the query
def get_sim(row, w2v=w2v_model):
    spl_query_list = row[1].strip().split(' ')
    spl_title_list = row[3].strip().split(' ')
    simfte = [0, 0]
    # Top-10 most similar words for each word in the title
    NoFindNum = 0
    FindNum = 0
    for title in spl_title_list:
        try:
            topn = [_[0] for _ in w2v.wv.most_similar([title])]
            FindNum += 1
        except:
            NoFindNum += 1
            topn = []
        for word in topn:
            if word in spl_query_list:
                simfte[0] += 1
    try:
        topn = [_[0] for _ in w2v.wv.most_similar(spl_title_list)]
        FindNum += 1
    except:
        NoFindNum += 1
        topn = []
    for word in topn:
        if word in spl_query_list:
            simfte[1] += 1
    return [simfte[0], simfte[1], FindNum, NoFindNum]
# Inversion-count feature: how far the shared words in the title deviate from their query order
def get_inverse_Zahl(row):
    query_list = row['query'].strip().split(' ')
    title_list = row['title'].strip().split(' ')
    title_list2 = list(set(title_list))
    title_list2.sort(key=title_list.index)
    inter_list = list(set(query_list) & set(title_list))
    inter_list.sort(key=query_list.index)
    if inter_list:
        index_list = []
        for word in inter_list:
            index_list.append(title_list2.index(word))
        ans = 0
        for i in range(len(index_list)):
            for j in range(i + 1, len(index_list)):
                if index_list[i] > index_list[j]:
                    ans += 1
        ans = ans / len(index_list)
    else:
        ans = -1
    return ans
# Mutual information between the binary word-occurrence vectors of query and title
def get_mutual_score(row):
    spl_query_list = row["query"].strip().split(' ')
    spl_title_list = row["title"].strip().split(' ')
    word_voca = list(set(set(spl_query_list) | set(spl_title_list)))
    inferred_vector_dm1 = np.zeros(len(word_voca))
    inferred_vector_dm2 = np.zeros(len(word_voca))
    for word in spl_query_list:
        if word in word_voca:
            inferred_vector_dm1[word_voca.index(word)] = 1
    for word in spl_title_list:
        if word in word_voca:
            inferred_vector_dm2[word_voca.index(word)] = 1
    mutual_score = mr.mutual_info_score(inferred_vector_dm1, inferred_vector_dm2)
    return mutual_score
## Features borrowed from the Quora Question Pairs solution by lpty
# Shared-word (co-occurrence) ratio
def Concurrence(row):
    q1words = {}
    q2words = {}
    query_list = row['query'].strip().split(' ')
    title_list = row['title'].strip().split(' ')
    for word in query_list:
        q1words[word] = q1words.get(word, 0) + 1
    for word in title_list:
        q2words[word] = q2words.get(word, 0) + 1
    n_shared_word_in_q1 = sum([q1words[w] for w in q1words if w in q2words])
    n_shared_word_in_q2 = sum([q2words[w] for w in q2words if w in q1words])
    n_tol = sum(q1words.values()) + sum(q2words.values())
    if 1e-6 > n_tol:
        return 0.
    else:
        return 1.0 * (n_shared_word_in_q1 + n_shared_word_in_q2) / n_tol
# Longest common subsequence (the DP below computes the subsequence, not the substring),
# normalized by the longer string length
def LongestCommonSeq(row):
    seq1 = row['query'].strip()
    seq2 = row['title'].strip()
    if len(seq1) == 0 or len(seq2) == 0:
        return 0.0
    m = [[0 for _ in range(len(seq2) + 1)] for _ in range(len(seq1) + 1)]
    for p1 in range(1, len(seq1) + 1):
        for p2 in range(1, len(seq2) + 1):
            if seq1[p1 - 1] == seq2[p2 - 1]:
                m[p1][p2] = m[p1 - 1][p2 - 1] + 1
            else:
                m[p1][p2] = max(m[p1 - 1][p2], m[p1][p2 - 1])
    a = m[-1][-1]
    b = max(len(seq1), len(seq2))
    return a / b
# jaccard_distance
def jaccard_distance(row):
    seq1 = row['query'].strip()
    seq2 = row['title'].strip()
    jaccard = distance.jaccard(seq1, seq2)
    return jaccard
# sorensen_distance
def sorensen_distance(row):
    seq1 = row['query'].strip()
    seq2 = row['title'].strip()
    sorensen = distance.sorensen(seq1, seq2)
    return sorensen
# Word-level length ratio
def WordLengthDiffRatio(row):
    q1 = row['query'].strip().split(' ')
    q2 = row['title'].strip().split(' ')
    return min(len(q1), len(q2)) / max(len(q1), len(q2))
# Sentence-level (character) length difference
def CharLengthDiff(row):
    q1 = row['query'].strip()
    q2 = row['title'].strip()
    return abs(len(q1) - len(q2))
# Sentence-level (character) length ratio
def CharLengthDiffRatio(row):
    q1 = row['query'].strip()
    q2 = row['title'].strip()
    return min(len(q1), len(q2)) / max(len(q1), len(q2))
### Representation features
# 'W2V_cosine', 'W2V_euclidean', 'W2V_manhattan'
def w2v_distance(row, doc2vec_model=model_dm2):
    # inferred_vector_dm1 = doc2vec_model.infer_vector(row[1].strip().split(' '))
    vec_seq1 = np.mean(doc2vec_model[row["query"].strip().split(' ')], axis=0)
    # inferred_vector_dm2 = doc2vec_model.infer_vector(row[3].strip().split(' '))
    vec_seq2 = np.mean(doc2vec_model[row["title"].strip().split(' ')], axis=0)
    cos_sim = 1 - cosine(vec_seq1, vec_seq2)  # effectively the doc_sims2 value we kept wanting to compute
    euclidean_sim = 1 - euclidean(vec_seq1, vec_seq2)
    manhattan_sim = 1 - cityblock(vec_seq1, vec_seq2)
    return [cos_sim, euclidean_sim, manhattan_sim]
#### WordMoverDistance
def wmd(document1, document2, model):
    # Remove out-of-vocabulary words.
    document1 = [token for token in document1 if token in model]
    document2 = [token for token in document2 if token in model]
    if len(document1) == 0 or len(document2) == 0:
        return 1.
    dictionary = Dictionary(documents=[document1, document2])
    vocab_len = len(dictionary)
    # Compute distance matrix.
    distance_matrix = zeros((vocab_len, vocab_len), dtype=double)
    for i, t1 in list(dictionary.items()):
        for j, t2 in list(dictionary.items()):
            distance_matrix[i, j] = cosine(model[t1], model[t2])
    if np_sum(distance_matrix) == 0.0:
        # `emd` gets stuck if the distance matrix contains only zeros.
        return 0.
    def nbow(document):
        d = zeros(vocab_len, dtype=double)
        nbow = dictionary.doc2bow(document)  # Word frequencies.
        doc_len = len(document)
        for idx, freq in nbow:
            d[idx] = freq / float(doc_len)  # Normalized word frequencies.
        return d
    # Compute nBOW representation of documents.
    d1 = nbow(document1)
    d2 = nbow(document2)
    # Compute WMD.
    res = emd(d1, d2, distance_matrix)
    return res if res >= 0 else 1
# WordMoverDistance
def WordMoverDistance(row, model=w2v_model2):
    seq1 = row["query"].strip().split(' ')
    seq2 = row["title"].strip().split(' ')
    wmd_distance = 1 - wmd(seq1, seq2, model)
    return wmd_distance
### W2VWeightDistance
def calculate_keyword(q, idf, idf_vocab, model):
    csr = idf.transform([q])
    index = csr.indices.tolist()
    values = csr.data.tolist()
    keyword_index = zip(index, values)
    keywords = [(idf_vocab[i[0]], i[1]) for i in keyword_index]
    keywords = [word for word in keywords if word[0] in model]
    return keywords
def calculate_vector(keywords, model):
    vectors = np.array([model[word[0]] * word[1] for word in keywords])
    vector = np.mean(vectors, axis=0)
    return vector
# 'W2VWeight_cosine', 'W2VWeight_euclidean', 'W2VWeight_manhattan'
def W2VWeightDistance(row, idf=idf, idf_vocab=idf_vocab, model=w2v_model2):
    q1 = row['query'].strip()
    q2 = row['title'].strip()
    q1_keywords = calculate_keyword(q1, idf, idf_vocab, model)
    q2_keywords = calculate_keyword(q2, idf, idf_vocab, model)
    if not len(q1_keywords) or not len(q2_keywords):
        return [0.0, 0.0, 0.0]
    q1_vector = calculate_vector(q1_keywords, model)
    q2_vector = calculate_vector(q2_keywords, model)
    cos_sim = 1 - cosine(q1_vector, q2_vector)
    euclidean_sim = 1 - euclidean(q1_vector, q2_vector)
    manhattan_sim = 1 - cityblock(q1_vector, q2_vector)
    return [cos_sim, euclidean_sim, manhattan_sim]
#### NGramW2VDistance
def calculate_gram(seq, n, w2v):
    seq = [word for word in seq if word in w2v]
    n_gram = [seq[i: n + i] for i in range(len(seq)) if n + i <= len(seq)]
    if not n_gram:
        n_gram = [seq]
    return n_gram
def calculate_sim(seq1, seq2, w2v):
    seq1_w2vs = [np.mean([w2v[word] for word in seq], axis=0) for seq in seq1]
    seq2_w2vs = [np.mean([w2v[word] for word in seq], axis=0) for seq in seq2]
    sims = [1 - cosine(seq1_w2v, seq2_w2v) for seq1_w2v in seq1_w2vs for seq2_w2v in seq2_w2vs]
    seq1_sim = np.mean([max(sims[i:i + len(seq2_w2vs)]) for i in range(0, len(sims), len(seq2_w2vs))])
    seq2_sim = np.mean([max(sims[i::len(seq2_w2vs)]) for i in range(len(seq2_w2vs))])
    sim = np.mean([seq1_sim, seq2_sim])
    return sim
def calculate_gram_sim(seq1, seq2, n, w2v):
    n_gram_q1 = calculate_gram(seq1, n, w2v)
    n_gram_q2 = calculate_gram(seq2, n, w2v)
    n_gram_sim = calculate_sim(n_gram_q1, n_gram_q2, w2v)
    return n_gram_sim
# 'NGramW2V_one', 'NGramW2V_two', 'NGramW2V_three'
def NGramW2VDistance(row, w2v=w2v_model2):
    q1 = row['query'].strip().split(' ')
    q2 = row['title'].strip().split(' ')
    one_gram_sim = calculate_gram_sim(q1, q2, 1, w2v)
    two_gram_sim = calculate_gram_sim(q1, q2, 2, w2v)
    three_gram_sim = calculate_gram_sim(q1, q2, 3, w2v)
    return [one_gram_sim, two_gram_sim, three_gram_sim]
### NGramTFIDFDistance
def calculate_gram2(seq, n, idf_vocab):
    seq = [word for word in seq if word in idf_vocab]
    n_gram = [seq[i: n + i] for i in range(len(seq)) if n + i <= len(seq)]
    if not n_gram:
        n_gram = [seq]
    return n_gram
def calculate_sim2(seq1, seq2, idf):
    seq1_idfs = [idf.transform([' '.join(seq)])[0] for seq in seq1]
    seq2_idfs = [idf.transform([' '.join(seq)])[0] for seq in seq2]
    sims = [1 - cosine(seq1_idf.toarray(), seq2_idf.toarray()) for seq1_idf in seq1_idfs for seq2_idf in seq2_idfs]
    seq1_sim = np.mean([max(sims[i:i + len(seq2_idfs)]) for i in range(0, len(sims), len(seq2_idfs))])
    seq2_sim = np.mean([max(sims[i::len(seq2_idfs)]) for i in range(len(seq2_idfs))])
    sim = np.mean([seq1_sim, seq2_sim])
    return sim
def calculate_gram_sim2(seq1, seq2, n, idf, idf_vocab):
    n_gram_q1 = calculate_gram2(seq1, n, idf_vocab)
    n_gram_q2 = calculate_gram2(seq2, n, idf_vocab)
    n_gram_sim = calculate_sim2(n_gram_q1, n_gram_q2, idf)
    return n_gram_sim
# 'NGramTFIDF_one', 'NGramTFIDF_two', 'NGramTFIDF_three'
def NGramTFIDFDistance(row, idf=idf, idf_vocab=idf_vocab):
    q1 = row['query'].strip().split(' ')
    q2 = row['title'].strip().split(' ')
    one_gram_sim = calculate_gram_sim2(q1, q2, 1, idf, idf_vocab)
    two_gram_sim = calculate_gram_sim2(q1, q2, 2, idf, idf_vocab)
    three_gram_sim = calculate_gram_sim2(q1, q2, 3, idf, idf_vocab)
    return [one_gram_sim, two_gram_sim, three_gram_sim]
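Finally, a hedged sketch of how a helper that returns several values is usually expanded into separate feature columns; it assumes a frame `df` with 'query'/'title' columns and that idf / idf_vocab are already fitted, and the column names are illustrative:

# Expand a multi-value feature into individual columns with result_type='expand'
# (df, idf and idf_vocab are assumed to exist; see the notes near the imports above).
ngram_tfidf = df.apply(NGramTFIDFDistance, axis=1, result_type='expand')
ngram_tfidf.columns = ['NGramTFIDF_one', 'NGramTFIDF_two', 'NGramTFIDF_three']
df = pd.concat([df, ngram_tfidf], axis=1)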