Python爬取糗事百科热门段子

# -*- coding: cp936 -*-
import urllib
import urllib2
import re
import os
import xlwt

def open_url(page):
    """Fetch one page of the qiushibaike "hot" listing and return its HTML.

    Args:
        page: Page number; it is converted with ``str`` and inserted into
            the listing URL.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib2.URLError: propagated from ``urllib2.urlopen`` when the
            request fails.
        UnicodeDecodeError: if the response body is not valid UTF-8.
    """
    # The request is sent with a desktop Chrome User-Agent header.
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
    }
    url = "https://www.qiushibaike.com/hot/page/" + str(page) + '/'
    req = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(req)
    try:
        return response.read().decode("utf-8")
    finally:
        # Python 2 urllib2 responses are not context managers, so close the
        # connection explicitly, even when read()/decode() raises.
        response.close()
    

# Regular expressions applied with re.findall to the listing-page HTML.
#
# p_name: an <img> tag with a protocol-relative src and an alt attribute.
#   group 1 = image URL without the leading "//", group 2 = alt text
#   (the alt text is what the script writes to the 'name' column).
# Sample match:
#<img src="//pic.qiushibaike.com/system/avtnew/2476/24768804/thumb/20171109213309.JPEG?imageView2/1/w/90/h/90" alt="一炮敏℡恩仇">
p_name = r'<img src="//([^"]+)" alt="([^"]+)">'
# p_age: the "articleGender" <div>.
#   group 1 = the remaining class text (e.g. "womenIcon"; written to 'gender'),
#   group 2 = the 1-3 character div content (e.g. "23"; written to 'age').
# Sample match:
#<div class="articleGender womenIcon">23</div>
p_age = r'<div class="articleGender ([^"]+)">([^"]{1,3})</div>'
# p_laugh: an <i class="number"> element; the single group is its 1-5
#   character content. The script consumes these matches in pairs.
# Sample match:
#<i class="number">994</i>
p_laugh = r'<i class="number">([^"]{1,5})</i>'

#file_object = open('thefile.txt', 'w')
#file_object.write(str(list_age))
#file_object.close( )


# Scrape the first PAGE_COUNT listing pages and dump the extracted fields
# into an Excel sheet, one row per matched entry.
PAGE_COUNT = 10

f = xlwt.Workbook()  # create the workbook
sheet1 = f.add_sheet(u'sheet1', cell_overwrite_ok=True)  # create the sheet

# Header row (row 0); data rows start at row 1.
for col, title in enumerate(('name', 'age', 'gender', 'comment num', 'good num')):
    sheet1.write(0, col, title)

# First free row in the sheet. A running offset is used instead of a fixed
# 25-rows-per-page stride so that a page yielding more than 25 matches is not
# overwritten by the next page, and a shorter page leaves no blank rows.
next_row = 1
for page in range(1, PAGE_COUNT + 1):
    html = open_url(page)
    list_name = re.findall(p_name, html)
    list_age = re.findall(p_age, html)
    list_laugh = re.findall(p_laugh, html)

    # p_name yields (image url, alt text); only the alt text is stored.
    for i, (_src, name) in enumerate(list_name):
        sheet1.write(next_row + i, 0, name)

    # p_age yields (class suffix, div text): div text -> 'age',
    # class suffix -> 'gender'.
    for i, (gender, age) in enumerate(list_age):
        sheet1.write(next_row + i, 1, age)
        sheet1.write(next_row + i, 2, gender)

    # p_laugh matches are consumed in pairs: the even-indexed match goes to
    # 'good num' and the odd-indexed one to 'comment num'. Floor division
    # ignores a trailing unpaired match.
    pair_count = len(list_laugh) // 2
    for i in range(pair_count):
        sheet1.write(next_row + i, 3, list_laugh[2 * i + 1])
        sheet1.write(next_row + i, 4, list_laugh[2 * i])

    # NOTE(review): the three lists are matched independently, so an entry
    # missing one of the elements would shift that column relative to the
    # others -- verify against the real page structure.
    next_row += max(len(list_name), len(list_age), pair_count)

f.save('糗事百科.xls')  # save the file


'''
for each in list_name:
    for i in range(2):
        print('\s' % each[i])
for each in list_age:
    for i in range(2):
        print(each[i])
for each in list_laugh:
  print(each)
'''

全部评论

推荐最新楼层

11-23 15:40

武汉大学算法工程师

b站实习几周的体验

首先，压力真的不大，大家都很随意，不会给你太多的push，很轻松。还有，饮料价格比外面便宜很多，0.45元的可乐、雪碧和芬达。同事们也很nice，工作中给了我很多照顾。作息方面也很自由，基本上11点前到公司就行，午休两个小时，晚餐也有一个小时。

哔哩哔哩公司氛围 96人发布

点赞评论收藏

11-27 00:32

湖南铁道职业技术学院后端

后端offer选择

美团后端开发总包n(15%是股票)

点赞评论收藏

10-21 17:38

桂林理工大学 Java

秋招现状

霁华Tel：秋招结束了，好累。我自编了一篇对话，语言别人看不懂，我觉得有某种力量在控制我的身体，我明明觉得有些东西就在眼前，但身边的人却说啥也没有，有神秘人通过电视，手机等在暗暗的给我发信号，我有时候会突然觉得身体的某一部分不属于我了。面对不同的人或场合，我表现出不一样的自己，以至于都不知道自己到底是什么样子的人。我觉得我已经做的很好，不需要其他人的建议和批评，我有些时候难以控制的兴奋，但是呼吸都让人开心。

点赞评论收藏