2019大数据挑战赛 (feature engineering code)

Contest link: 2019大数据挑战赛

Some thoughts:
The Big Data Challenge carried real weight: nearly 5,000 participants, most of them graduate students, competing through qualifier -> preliminary -> semi-final -> final. Only those who passed the qualifier entered the official rounds, and each stage used a larger dataset. Teams that cleared the preliminary round got GPU time on the online platform, but with the data growing to roughly one billion rows compute was still tight, so the code had to be tuned to allocate resources sensibly: neither stalling from excessive GPU usage nor leaving the hardware under-used. Beyond that, it comes down to the participants themselves; this kind of contest really benefits from teamwork, because in such a competitive field more ideas are always better. It also demands a lot of individual coding skill, analytical thinking, fast learning, collaboration, and experience. All in all, I learned a great deal from it O(∩_∩)O

What follows is only the feature-engineering part of the code, mostly small helper functions, kept so I can recall these methods next time. There is a lot of code and it is tedious to organize, so this is primarily a personal record; I will tidy it up in more detail when I have time. If you are reading along for reference, please bear with the rough state <(^-^)>
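The snippets below are pasted without their imports, so as a reading aid here is my best guess at what they assume is already in scope (the fitted idf vectorizer, the idf_vocab index-to-word mapping, and the model_dm2 / w2v_model2 embedding models are prepared elsewhere and are not reproduced here):

import difflib
import distance                      # the "distance" pip package (jaccard / sorensen)
import gensim
import numpy as np
import pandas as pd
import xgboost as xgb
from numpy import zeros, double, sum as np_sum
from scipy.spatial.distance import cosine, euclidean, cityblock
from sklearn import metrics as mr    # assumed alias for mr.mutual_info_score below
from sklearn.metrics import roc_curve, auc, log_loss
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from gensim.models import Doc2Vec
from gensim.corpora import Dictionary
from pyemd import emd                # used by the WordMoverDistance snippet

# Assumed to exist already (not shown): idf (a fitted TfidfVectorizer),
# idf_vocab (index -> word for that vectorizer), model_dm2 and w2v_model2
# (extra Doc2Vec / word2vec models loaded the same way as model_dm / w2v_model).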

# qAUC: compute AUC per query_id and average; queries whose labels are all 0 or all 1 contribute 0.5
def get_qAUC(qid_set, Y_pred, Y_test):
    valid1 = pd.DataFrame()
    valid1['query_id'] = qid_set
    valid1["pred_prob"] = Y_pred
    valid1['label'] = Y_test
    vg = valid1.groupby(['query_id'])
    aucs = []
    for i in vg:
        tmpdf = i[1] 
        if len(tmpdf['label'].unique()) != 2:
            aucs.append(0.5)
            continue
        fpr, tpr, thresholds = roc_curve(tmpdf['label'], tmpdf['pred_prob'], pos_label=1)
        aucs.append(auc(fpr, tpr))
    qAUC = np.average(aucs)
    return qAUC
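A minimal usage sketch for get_qAUC (the names valid_df, feature_cols and model are illustrative, not part of the original code):

# Score the validation split with any fitted binary classifier, then average AUC per query
Y_pred = model.predict_proba(valid_df[feature_cols])[:, 1]
print('validation qAUC:', get_qAUC(valid_df['query_id'].values, Y_pred, valid_df['label'].values))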
def selectSampleOne(dta, threshold=0.2):
    '''
    Select samples by threshold: for each query, the ratio of clicked titles to
    total titles (the mean of label within the query) must reach the threshold.
    Input : dta, a DataFrame; threshold, default 0.2
    Output: the filtered DataFrame
    Method: group-wise mean, merge it back, then filter by the condition.
    '''
    lab_mean = dta[['query_id', 'label']].groupby('query_id').mean().rename(columns={'label':'lab_mean'})
    dta = pd.merge(dta, lab_mean, on='query_id', how='left')
    dta = dta[(dta['lab_mean'] >= threshold)]
    del dta['lab_mean']
    return dta
def selectSampleTwo(dta, threshold=0.2):
    '''
    Same sample filter as selectSampleOne: keep queries whose click ratio
    (mean label) exceeds the threshold.
    Input : dta, a DataFrame; threshold, default 0.2
    Output: the filtered DataFrame
    Method: compute group means, collect the qualifying query_ids, then filter
    with an index-membership test.
    '''
    groupbyed = dta[['query_id', 'label']].groupby('query_id').mean()
    index = groupbyed.index
    lab_mean = groupbyed['label'].values
    
    select = []
    for _ in range(len(lab_mean)):
        if lab_mean[_] > threshold:
            select.append(index[_])

    dta = dta[(np.isin(dta.query_id, np.array(select)))]
    
    return dta
def XGBoostONELR3(X_train, X_test, Y_train, Y_test, X_train_lr, Y_train_lr):
    # GBDT + LR stacking: one-hot encode the XGBoost leaf indices, concatenate
    # degree-2 polynomial features, then train an L1 logistic regression on top.
    XGB = xgb.XGBClassifier(nthread=14, learning_rate=0.08, n_estimators=100)
    XGB.fit(X_train, Y_train)
    print('XGB training done ------>')
    OHE = OneHotEncoder()
    OHE.fit(XGB.apply(X_train))  # XGB.apply returns the leaf index per tree
    print('OneHotEncoder fitting done ------>')
    
    ploy = PolynomialFeatures(degree=2).fit(X_train_lr)
    print('PolynomialFeatures fitting done ------>')
    tran = np.hstack((OHE.transform(XGB.apply(X_train_lr)).toarray(), ploy.transform(X_train_lr)))
    print('hstack done ------>')
    LR = LogisticRegression(n_jobs=14, C=0.06, penalty='l1')
    LR.fit(tran, Y_train_lr)
    del tran
    print('LR training done ------>')
    
    Y_pred = LR.predict_proba(np.hstack((OHE.transform(XGB.apply(X_test)).toarray(), ploy.transform(X_test))))[:, 1]
    print('XGBoost + LogisticRegression: log_loss', log_loss(Y_test, Y_pred))
    return XGB, OHE, LR, ploy
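The function above is the usual GBDT + LR stack: XGB.apply() gives the leaf each sample lands in for every tree, the one-hot encoded leaves plus degree-2 polynomial features feed an L1 logistic regression, and scoring new data has to replay the same transformation chain. A hedged usage sketch (the split names are illustrative):

XGB, OHE, LR, ploy = XGBoostONELR3(X_tree, X_valid, y_tree, y_valid, X_lr, y_lr)
# Predict on fresh data by repeating the same feature transformations
new_pred = LR.predict_proba(np.hstack((OHE.transform(XGB.apply(X_new)).toarray(),
                                       ploy.transform(X_new))))[:, 1]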
# query click-through rate: clicks, impressions and their ratio per query
def query_ctr(df):
    item = "query"
    temp = df.groupby(item,as_index=False)['label'].agg({item+'_click':'sum', item+'_count':'count'})
    temp[item+'_ctr'] = temp[item+'_click']/(temp[item+'_count'])
    df = pd.merge(df, temp, on=item, how='left')
    
    return df
## Text features
# query / title length in words
def get_length(row):
    temp = len(row.strip().split(' '))
    return temp
    
# Whether query is a subset of title; intersection / difference sizes follow below
def get_subset(row):
    temp1 = set(row['query'].strip().split(' '))
    temp2 = row['title'].strip().split(' ')
    
    if temp1.issubset(temp2):
        return 1
    else:
        return 0

def get_inter_length(row):
    temp1 = set(row['query'].strip().split(' '))
    temp2 = set(row['title'].strip().split(' '))
    
    inter_length = len(temp1 & temp2)  # intersection size
    
    return inter_length

def get_differ_length(row):
    temp1 = set(row['query'].strip().split(' '))
    temp2 = set(row['title'].strip().split(' '))
        
    differ_length = len(temp2 - temp1)  # words in title but not in query
    
    return differ_length

# Whether title is a subset of query
def get_subset2(row):
    temp1 = set(row['query'].strip().split(' '))
    temp2 = set(row['title'].strip().split(' '))
    
    if temp2.issubset(temp1):
        return 1
    else:
        return 0    

# Position of the first query word within title; -1 if it does not appear
def get_location(row):
    str1 = row['query'].strip().split(' ')[0]
    str2 = row['title'].strip().split(' ')
    
    if str1 in str2:
        location = str2.index(str1)
    else:
        location = -1
    
    return location
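All of these helpers are written to be applied row by row; a sketch of how the text features would be materialized on the query/title DataFrame (df and the column names are illustrative):

df['query_len'] = df['query'].apply(get_length)
df['title_len'] = df['title'].apply(get_length)
df['query_in_title'] = df.apply(get_subset, axis=1)
df['inter_len'] = df.apply(get_inter_length, axis=1)
df['differ_len'] = df.apply(get_differ_length, axis=1)
df['title_in_query'] = df.apply(get_subset2, axis=1)
df['first_word_loc'] = df.apply(get_location, axis=1)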
## Semantic similarity
# Edit (Levenshtein) distance between query and title, computed with difflib

def difflib_leven(row):
    str1 = row['query'].strip()
    str2 = row['title'].strip()
    leven_cost = 0
    s = difflib.SequenceMatcher(None, str1, str2)
    for tag, i1, i2, j1, j2 in s.get_opcodes():
        # print('{:7} a[{}: {}] --> b[{}: {}] {} --> {}'.format(tag, i1, i2, j1, j2, str1[i1: i2], str2[j1: j2]))
   
        if tag == 'replace':
            leven_cost += max(i2-i1, j2-j1)
        elif tag == 'insert':
            leven_cost += (j2-j1)
        elif tag == 'delete':
            leven_cost += (i2-i1)
    return leven_cost        


# Hamming-style distance between query and title (word level; the length gap also counts as mismatches)
def hamming_distance(row):
    temp1 = row['query'].strip().split(' ')
    temp2 = row['title'].strip().split(' ')
    
    if (len(temp1) < len(temp2)):
        temp_max = len(temp2)
        temp_min = len(temp1)
    else:
        temp_max = len(temp1)
        temp_min = len(temp2)
        
    ham_distance = temp_max - temp_min
    
    for i in range(temp_min):
        if  temp1[i] != temp2[i]:
            ham_distance += 1
    
    return ham_distance


# Cosine similarity between query and title (bag-of-words term counts)
def cos_distance(row):
    temp1 = row['query'].strip().split(' ')
    temp2 = row['title'].strip().split(' ')
    
    union_set = list(set(temp1 + temp2))
    
    vector1 = np.zeros(len(union_set))
    vector2 = np.zeros(len(union_set))
    
    # Fill in each position of the two count vectors
    for i in range(len(union_set)):
        # count occurrences of the vocabulary word in each sequence
        for k1 in range(len(temp1)):
            if union_set[i] == temp1[k1]:
                vector1[i] += 1
        for k2 in range(len(temp2)):
            if union_set[i] == temp2[k2]:
                vector2[i] += 1
    
    cos_dist = float(np.dot(vector1,vector2)/(np.linalg.norm(vector1)*np.linalg.norm(vector2)))
    
    return cos_dist
# Union size
def get_union_length(row):
    temp1 = set(row['query'].strip().split(' '))
    temp2 = set(row['title'].strip().split(' '))
    
    union_length = len(temp1 | temp2)  # union
    
    return union_length

# doc2vec similarity
model_dm = Doc2Vec.load('doc2vec.model')
def doc_sims(row,doc2vec_model=model_dm):
    # inferred_vector_dm1 = doc2vec_model.infer_vector(row[1].strip().split(' '))
    inferred_vector_dm1 = np.mean(doc2vec_model[row[1].strip().split(' ')],axis=0)
    inferred_vector_dm2 = doc2vec_model.infer_vector(row[3].strip().split(' '))
    # inferred_vector_dm2 = np.mean(doc2vec_model[row[3].strip().split(' ')],axis=0)
    
    sims = (np.dot(inferred_vector_dm1,inferred_vector_dm2))/(np.linalg.norm(inferred_vector_dm1) * np.linalg.norm(inferred_vector_dm2))
    
    return sims
# Difference size 2 (query minus title)
def get_differ_length2(row):
    temp1 = set(row['query'].strip().split(' '))
    temp2 = set(row['title'].strip().split(' '))
        
    differ_length2 = len(temp1 - temp2)  # words in query but not in title
    
    return differ_length2
w2v_model = gensim.models.KeyedVectors.load_word2vec_format("word2vec.model")
def get_sim(row, w2v=w2v_model):
    
    spl_query_list = row[1].strip().split(' ')
    spl_title_list = row[3].strip().split(' ')
    
    simfte = [0, 0]
    # top-10 most similar words for each word in title
    NoFindNum = 0
    FindNum = 0
    for title in spl_title_list:
        try:
            topn = [_[0] for _ in w2v.most_similar([title])]
            FindNum += 1
        except:
            NoFindNum += 1
            topn = []
        for word in topn:
            if word in spl_query_list:
                simfte[0]+=1
    try:
        topn = [_[0] for _ in w2v.most_similar(spl_title_list)]
        FindNum += 1
    except:
        NoFindNum += 1
        topn = []
    for word in topn:
        if word in spl_query_list:
            simfte[1]+=1
    return  [simfte[0], simfte[1], FindNum, NoFindNum]
    
# Inversion-count feature: how out of order the shared words appear in title relative to query
def get_inverse_Zahl(row):
    query_list = row['query'].strip().split(' ')
    title_list = row['title'].strip().split(' ')
    title_list2 = list(set(title_list))
    title_list2.sort(key=title_list.index)
    inter_list = list(set(query_list)&set(title_list))
    inter_list.sort(key=query_list.index)
    if inter_list:
        index_list = []
        for word in inter_list:
            index_list.append(title_list2.index(word))
        ans = 0
        for i in range(len(index_list)):
            for j in range(i+1,len(index_list)):
                if index_list[i] > index_list[j]:
                    ans += 1
        ans = ans/len(index_list)
    else:
        ans = -1
    return ans
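A toy sanity check of the inversion-count feature (made-up tokens):

row = {'query': 'a b c', 'title': 'c x a'}
# shared words ordered by query: ['a', 'c']; their positions in the deduplicated
# title are [2, 0], i.e. one inversion over two shared words -> 1 / 2
print(get_inverse_Zahl(row))   # 0.5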
# Mutual information between the binary word-presence vectors of query and title
def get_mutual_score(row):
    spl_query_list = row["query"].strip().split(' ')
    spl_title_list = row["title"].strip().split(' ')
    
    word_voca = list(set(set(spl_query_list)|set(spl_title_list)))
    
    inferred_vector_dm1 = np.zeros(len(word_voca))
    inferred_vector_dm2 = np.zeros(len(word_voca))
    
    for word in spl_query_list:
        if word in word_voca:
            inferred_vector_dm1[word_voca.index(word)] = 1
    for word in spl_title_list:
        if word in word_voca:
            inferred_vector_dm2[word_voca.index(word)] = 1
            
    mutual_score = mr.mutual_info_score(inferred_vector_dm1,inferred_vector_dm2)
   
    return mutual_score
## Features borrowed from lpty's Quora Question Pairs solution
# Co-occurring words (shared-word ratio)
def Concurrence(row):
    q1words = {}
    q2words = {}
    
    query_list = row['query'].strip().split(' ')
    title_list = row['title'].strip().split(' ')
    
    for word in query_list:
        q1words[word] = q1words.get(word, 0) + 1
    for word in title_list:
        q2words[word] = q2words.get(word, 0) + 1
    
    n_shared_word_in_q1 = sum([q1words[w] for w in q1words if w in q2words])
    n_shared_word_in_q2 = sum([q2words[w] for w in q2words if w in q1words])
    n_tol = sum(q1words.values()) + sum(q2words.values())
    
    if 1e-6 > n_tol:
        return 0.
    else:
        return 1.0 * (n_shared_word_in_q1 + n_shared_word_in_q2) / n_tol
# Longest common subsequence (character level), normalized by the longer length
def LongestCommonSeq(row):
    seq1 = row['query'].strip()
    seq2 = row['title'].strip()
    
    if len(seq1) == 0 or len(seq2) == 0:
        return 0.0
    m = [[0 for _ in range(len(seq2)+1)] for _ in range(len(seq1)+1)]
    
    for p1 in range(1, len(seq1)+1):
        for p2 in range(1, len(seq2)+1):
            if seq1[p1-1] == seq2[p2-1]:
                m[p1][p2] = m[p1 - 1][p2 - 1] + 1
            else:
                m[p1][p2] = max(m[p1 - 1][p2], m[p1][p2 - 1])
    a = m[-1][-1]
    b = max(len(seq1), len(seq2))
    
    return a / b
# jaccard_distance
def jaccard_distance(row):
    seq1 = row['query'].strip()
    seq2 = row['title'].strip()
    
    jaccard = distance.jaccard(seq1, seq2)
    
    return jaccard
# sorensen_distance
def sorensen_distance(row):
    seq1 = row['query'].strip()
    seq2 = row['title'].strip()
    
    sorensen = distance.sorensen(seq1, seq2)
    
    return sorensen
# Word-level length ratio
def WordLengthDiffRatio(row):
    q1 = row['query'].strip().split(' ')
    q2 = row['title'].strip().split(' ')
    
    return min(len(q1), len(q2)) / max(len(q1), len(q2))
# Character-level length difference
def CharLengthDiff(row):
    q1 = row['query'].strip()
    q2 = row['title'].strip()
    
    return abs(len(q1) - len(q2))
# Character-level length ratio
def CharLengthDiffRatio(row):
    q1 = row['query'].strip()
    q2 = row['title'].strip()
    
    return min(len(q1), len(q2)) / max(len(q1), len(q2))
### Representation features
# 'W2V_cosine', 'W2V_euclidean', 'W2V_manhattan'
def w2v_distance(row,doc2vec_model=model_dm2):
    # inferred_vector_dm1 = doc2vec_model.infer_vector(row[1].strip().split(' '))
    vec_seq1 = np.mean(doc2vec_model[row["query"].strip().split(' ')],axis=0)
    # inferred_vector_dm2 = doc2vec_model.infer_vector(row[3].strip().split(' '))
    vec_seq2 = np.mean(doc2vec_model[row["title"].strip().split(' ')],axis=0)
    
    cos_sim = 1 - cosine(vec_seq1, vec_seq2)  # effectively the doc_sims2 we had been meaning to compute
    euclidean_sim = 1 - euclidean(vec_seq1, vec_seq2)
    manhattan_sim = 1 - cityblock(vec_seq1, vec_seq2)
    return [cos_sim, euclidean_sim, manhattan_sim]
#### WordMoverDistance

def wmd(document1, document2, model):
    # Word Mover's Distance between two token lists, following gensim's wmdistance (via pyemd).
    # Remove out-of-vocabulary words.
    document1 = [token for token in document1 if token in model]
    document2 = [token for token in document2 if token in model]
    if len(document1) == 0 or len(document2) == 0:
        return 1.
    dictionary = Dictionary(documents=[document1, document2])
    vocab_len = len(dictionary)
    # Compute distance matrix.
    distance_matrix = zeros((vocab_len, vocab_len), dtype=double)
    for i, t1 in list(dictionary.items()):
        for j, t2 in list(dictionary.items()):
            distance_matrix[i, j] = cosine(model[t1], model[t2])
    if np_sum(distance_matrix) == 0.0:
        # `emd` gets stuck if the distance matrix contains only zeros.
        return 0.

    def nbow(document):
        d = zeros(vocab_len, dtype=double)
        bow = dictionary.doc2bow(document)  # Word frequencies.
        doc_len = len(document)
        for idx, freq in bow:
            d[idx] = freq / float(doc_len)  # Normalized word frequencies.
        return d

    # Compute nBOW representation of documents.
    d1 = nbow(document1)
    d2 = nbow(document2)
    # Compute WMD.
    res = emd(d1, d2, distance_matrix)
    return res if res >= 0 else 1.

# WordMoverDistance
def WordMoverDistance(row, model=w2v_model2):
    seq1 = row["query"].strip().split(' ')
    seq2 = row["title"].strip().split(' ')
    
    wmd_distance = 1 - wmd(seq1, seq2, model)
    
    return wmd_distance
### W2VWeightDistance
def calculate_keyword(q,idf,idf_vocab,model):
    csr = idf.transform([q])
    index = csr.indices.tolist()
    values = csr.data.tolist()
    keyword_index = zip(index, values)
    keywords = [(idf_vocab[i[0]], i[1]) for i in keyword_index]
    keywords = [word for word in keywords if word[0] in model]
        
    return keywords


def calculate_vector(keywords,model):
    vectors = np.array([model[word[0]]*word[1] for word in keywords])
    vector = np.mean(vectors, axis=0)
    
    return vector
# 'W2VWeight_cosine', 'W2VWeight_euclidean', 'W2VWeight_manhattan'
def W2VWeightDistance(row,idf=idf,idf_vocab=idf_vocab,model=w2v_model2):
    q1 = row['query'].strip()
    q2 = row['title'].strip()
    
    q1_keywords = calculate_keyword(q1,idf,idf_vocab,model)
    q2_keywords = calculate_keyword(q2,idf,idf_vocab,model)
    
    if not len(q1_keywords) or not len(q2_keywords):
        return [0.0, 0.0, 0.0]
        
    q1_vector = calculate_vector(q1_keywords,model)
    q2_vector = calculate_vector(q2_keywords,model)
    
    cos_sim = 1 - cosine(q1_vector, q2_vector)
    euclidean_sim = 1 - euclidean(q1_vector, q2_vector)
    manhattan_sim = 1 - cityblock(q1_vector, q2_vector)
    
    return [cos_sim, euclidean_sim, manhattan_sim]

#### NGramW2VDistance
def calculate_gram(seq, n, w2v):
    seq = [word for word in seq if word in w2v]
    n_gram = [seq[i: n+i] for i in range(len(seq)) if n+i <= len(seq)]
    if not n_gram: n_gram = [seq]
    
    return n_gram
def calculate_sim(seq1, seq2, w2v):
    seq1_w2vs = [np.mean([w2v[word] for word in seq], axis=0) for seq in seq1]
    seq2_w2vs = [np.mean([w2v[word] for word in seq], axis=0) for seq in seq2]
    sims = [1 - cosine(seq1_w2v, seq2_w2v) for seq1_w2v in seq1_w2vs for seq2_w2v in seq2_w2vs]
    seq1_sim = np.mean([max(sims[i:i+len(seq2_w2vs)]) for i in range(0, len(sims), len(seq2_w2vs))])
    seq2_sim = np.mean([max(sims[i::len(seq2_w2vs)]) for i in range(len(seq2_w2vs))])
    sim = np.mean([seq1_sim, seq2_sim])
    
    return sim


def calculate_gram_sim(seq1, seq2, n, w2v):
    n_gram_q1 = calculate_gram(seq1, n, w2v)
    n_gram_q2 = calculate_gram(seq2, n, w2v)
    n_gram_sim = calculate_sim(n_gram_q1, n_gram_q2, w2v)
    
    return n_gram_sim
# 'NGramW2V_one', 'NGramW2V_two', 'NGramW2V_three'
def NGramW2VDistance(row,w2v=w2v_model2):
    q1 = row['query'].strip().split(' ')
    q2 = row['title'].strip().split(' ')
    one_gram_sim = calculate_gram_sim(q1, q2, 1, w2v)
    two_gram_sim = calculate_gram_sim(q1, q2, 2, w2v)
    three_gram_sim = calculate_gram_sim(q1, q2, 3, w2v)
    
    return [one_gram_sim, two_gram_sim, three_gram_sim]
### NGramTFIDFDistance

def calculate_gram2(seq, n,idf_vocab):
    seq = [word for word in seq if word in idf_vocab]
    n_gram = [seq[i: n+i] for i in range(len(seq)) if n+i <= len(seq)]
    if not n_gram: n_gram = [seq]
    
    return n_gram
def calculate_sim2(seq1, seq2, idf):
    seq1_idfs = [idf.transform([' '.join(seq)])[0] for seq in seq1]
    seq2_idfs = [idf.transform([' '.join(seq)])[0] for seq in seq2]
    sims = [1 - cosine(seq1_idf.toarray(), seq2_idf.toarray()) for seq1_idf in seq1_idfs for seq2_idf in seq2_idfs]
    seq1_sim = np.mean([max(sims[i:i+len(seq2_idfs)]) for i in range(0, len(sims), len(seq2_idfs))])
    seq2_sim = np.mean([max(sims[i::len(seq2_idfs)]) for i in range(len(seq2_idfs))])
    sim = np.mean([seq1_sim, seq2_sim])
    
    return sim


def calculate_gram_sim2(seq1, seq2, n, idf, idf_vocab):
    n_gram_q1 = calculate_gram2(seq1, n, idf_vocab)
    n_gram_q2 = calculate_gram2(seq2, n, idf_vocab)
    n_gram_sim = calculate_sim2(n_gram_q1, n_gram_q2, idf)
    
    return n_gram_sim
# 'NGramTFIDF_one', 'NGramTFIDF_two', 'NGramTFIDF_three'
def NGramTFIDFDistance(row,idf=idf,idf_vocab=idf_vocab):
    q1 = row['query'].strip().split(' ')
    q2 = row['title'].strip().split(' ')
    one_gram_sim = calculate_gram_sim2(q1, q2, 1,idf,idf_vocab)
    two_gram_sim = calculate_gram_sim2(q1, q2, 2,idf,idf_vocab)
    three_gram_sim = calculate_gram_sim2(q1, q2, 3,idf,idf_vocab)
    
    return [one_gram_sim, two_gram_sim, three_gram_sim]
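Several of the functions above (get_sim, w2v_distance, W2VWeightDistance, NGramW2VDistance, NGramTFIDFDistance) return a list per row; one way to expand such a result into separate feature columns (column names taken from the comments above, df is illustrative):

ngram_cols = ['NGramW2V_one', 'NGramW2V_two', 'NGramW2V_three']
df[ngram_cols] = pd.DataFrame(df.apply(NGramW2VDistance, axis=1).tolist(), index=df.index)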
