Web Scraper Case Study (1)
With some free time at home and as a newcomer to the Python world, I tried building a scraper project to crawl content from the 囧事百科 site.
Straight to the code:
import json

import requests
from lxml import etree


class QiubaiSpider:
    def __init__(self):
        self.url_temp = "http://www.cnxox.com/baike/p{}/"

    def get_url_list(self):
        # Build url_list from the URL pattern and the total page count
        return [self.url_temp.format(i) for i in range(1, 14)]

    def parse_url(self, url):
        # Send the request and return the decoded response body
        response = requests.get(url)
        return response.content.decode()

    def get_content_list(self, html_str):
        # Extract the data: group by article first, then pull fields from each group
        html = etree.HTML(html_str)
        div_list = html.xpath('//div[@class="content"]/article')
        content_list = []
        for div in div_list:
            item = {}
            title = div.xpath('.//h2/a/@title')
            item["title"] = title[0] if title else None
            item["note"] = div.xpath('.//p[@class="note"]/text()')
            read = div.xpath('.//p[@class="text-muted views"]/span/text()')
            item["read"] = read[0] if read else None
            zan = div.xpath('.//p[@class="text-muted views"]/a/span/text()')
            item["zan"] = zan[0] if zan else None
            img = div.xpath('.//p[@class="focus"]/a/span/span/img/@src')
            item["img"] = img[0] if img else None
            content_list.append(item)
        return content_list

    def save_content_list(self, content_list):
        # Append each item to the file as one JSON line
        with open("qiubai.txt", 'a', encoding="utf-8") as f:
            for content in content_list:
                f.write(json.dumps(content, ensure_ascii=False))
                f.write('\n')
        print('Saved successfully')

    def run(self):  # the main logic
        # 1. Build url_list from the URL pattern
        url_list = self.get_url_list()
        for url in url_list:
            # 2. Send the request and get the response
            html_str = self.parse_url(url)
            # 3. Extract the data
            content_list = self.get_content_list(html_str)
            for content in content_list:
                print(content)
            # 4. Save the data
            self.save_content_list(content_list)


if __name__ == '__main__':
    qiubai = QiubaiSpider()
    qiubai.run()
Run output: on a successful run, the script prints each parsed item page by page, appends the items to qiubai.txt as JSON lines, and prints "Saved successfully" after each page.
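One readability note: get_content_list repeats the "first XPath match or None" pattern for every field. A small helper, sketched here under the hypothetical name first_or_none, shrinks each field to a single call:

from lxml import etree

def first_or_none(element, path):
    # Return the first XPath match, or None when there is no match
    matches = element.xpath(path)
    return matches[0] if matches else None

# Hypothetical usage, e.g. inside get_content_list:
html = etree.HTML('<article><h2><a title="demo">x</a></h2></article>')
print(first_or_none(html, './/h2/a/@title'))              # 'demo'
print(first_or_none(html, './/p[@class="note"]/text()'))  # None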
Summary:
When writing a scraper script, write the code in this four-step order (a compact sketch follows the list):
- url
  - Know the URL pattern and the total page count: build the list of URL addresses
  - start_url
- Send requests and get responses
  - requests
- Extract the data
  - A JSON string is returned: the json module
  - An HTML string is returned: the lxml module with XPath
- Save the data
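For quick reference, here is a minimal sketch of that four-step pattern against a generic paginated site. The base URL, page count, and XPath expression are placeholders, and the User-Agent header and timeout are defensive additions of mine, not part of the original script:

import json

import requests
from lxml import etree

BASE_URL = "http://example.com/list/p{}/"  # placeholder URL pattern
PAGE_COUNT = 3                             # placeholder total page count

def run():
    # Step 1: build the URL list from the known pattern and page count
    url_list = [BASE_URL.format(i) for i in range(1, PAGE_COUNT + 1)]
    for url in url_list:
        # Step 2: send the request and get the response
        # (User-Agent and timeout are hardening additions, not in the original)
        response = requests.get(
            url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=10,
        )
        html = etree.HTML(response.content.decode())
        # Step 3: extract the data with XPath (placeholder expression)
        titles = html.xpath('//h2/a/@title')
        # Step 4: save the data as JSON lines
        with open("result.txt", "a", encoding="utf-8") as f:
            for title in titles:
                f.write(json.dumps({"title": title}, ensure_ascii=False))
                f.write("\n")

if __name__ == "__main__":
    run()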