Scrapy实践:爬取斗鱼TV主播的头像(重写ImagesPipeline实现图片爬取)
注:若运行以下代码时报出与“PIL”有关的错误,安装Pillow即可(pip install pillow)。
实现思路:
(1)使用Fiddler抓包工具,找出为斗鱼APP颜值区提供数据的URL(返回数据为JSON格式)
(2)在items中定义需要记录的相关信息
(3)在spider中实现迭代爬取各主播的信息
(4)在Pipeline中通过继承内置的ImagesPipeline类,重写其功能,实现图片的下载
(5)在settings.py文件中进行相关配置
代码实现:
items.py
# -*- coding: utf-8 -*-
import scrapy
class DouyuItem(scrapy.Item):
    """Fields recorded for a single streamer."""

    # Image link
    vertical_src = scrapy.Field()

    # Streamer name
    nickname = scrapy.Field()

    # Path where the image is saved
    imagePath = scrapy.Field()
爬虫文件(spiders/douyuMM.py)
# -*- coding: utf-8 -*-
import scrapy
import json
from Douyu.items import DouyuItem
class DouyummSpider(scrapy.Spider):
    """Walk the ``getVerticalRoom`` JSON endpoint page by page.

    Each page is requested with ``limit=20`` and an increasing ``offset``.
    One ``DouyuItem`` is yielded per entry, and the next page is requested
    until ``max_items`` entries have been asked for or a page comes back
    without a usable ``data`` list.
    """

    name = 'douyuMM'
    allowed_domains = ['capi.douyucdn.cn']
    initial_URL = 'http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset='
    # Entries requested per page; must match ``limit=20`` in ``initial_URL``.
    page_size = 20
    # Pagination stops once ``offset`` reaches this value (first 100 streamers).
    max_items = 100
    offset = 0
    URL = initial_URL + str(offset)
    start_urls = [URL]

    def parse(self, response):
        """Yield one item per streamer on this page, then request the next page.

        Args:
            response: The response whose body is the JSON text of one page.

        Yields:
            ``DouyuItem`` objects, followed by at most one ``scrapy.Request``
            for the next page.
        """
        payload = json.loads(response.text)
        data = payload.get('data') if isinstance(payload, dict) else None
        # An error payload or an empty page means there is nothing more to
        # fetch; stop here instead of crashing or requesting further pages.
        if not isinstance(data, list) or not data:
            self.logger.warning('No usable "data" list in %s; stopping.', response.url)
            return

        for each in data:
            image_url = each.get('vertical_src')
            nickname = each.get('nickname')
            # The pipeline needs both the URL (to download) and the nickname
            # (to name the file), so incomplete entries are skipped.
            if not image_url or not nickname:
                continue
            item = DouyuItem()
            item['nickname'] = nickname
            # Stored as a list so the pipeline can iterate over it.
            item['vertical_src'] = [image_url]
            yield item

        self.offset += self.page_size
        if self.offset < self.max_items:
            self.URL = self.initial_URL + str(self.offset)
            yield scrapy.Request(url=self.URL, callback=self.parse)
pipelines.py
# -*- coding: utf-8 -*-
import os
import scrapy
"""
pipelines提供了图片相关的方法,因此只需继承ImagesPipeline
并对“get_media_requests”和“item_completed”重写即可实现图片的下载
"""
from scrapy.pipelines.images import ImagesPipeline
#通过get_project_settings来获取settings.py文件中设置的变量
from scrapy.utils.project import get_project_settings
class DouyuImagesPipeline(ImagesPipeline):
    """Download each item's image and rename the file after the streamer.

    ``get_media_requests`` schedules the downloads and ``item_completed``
    renames the first successfully downloaded file to
    ``<IMAGES_STORE>/full/<nickname>.jpg``.
    """

    # Read from settings.py; used as the base directory for the rename.
    IMAGES_STORE = get_project_settings().get('IMAGES_STORE')

    def get_media_requests(self, item, info):
        """Yield one download request per URL in ``item['vertical_src']``."""
        # ``vertical_src`` is a list of URLs. If it held a single URL string,
        # a plain ``yield scrapy.Request(item['vertical_src'])`` would do,
        # but that form can only fetch one image.
        for image_url in item['vertical_src']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        """Rename the downloaded image to the nickname and record its path.

        Args:
            results: A list of ``(ok, value)`` tuples. For ``ok == True`` the
                value is a dict with ``checksum``, ``path`` and ``url`` keys,
                for example::

                    [(True,
                      {'checksum': '2b00042f7481c7b056c4b410d28f33cf',
                       'path': 'full/0a79c461a4062ac383dc4fade7bc09f1384a3910.jpg',
                       'url': 'http://www.example.com/files/product1.pdf'}),
                     (False,
                      Failure(...))]
            item: The item whose images were requested.
            info: Passed through by the pipeline; unused here.

        Returns:
            The item with ``imagePath`` set to the renamed file's path.

        Raises:
            DropItem: If no image was downloaded successfully.
        """
        # Imported locally so this class does not depend on an extra
        # module-level import.
        from scrapy.exceptions import DropItem

        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            # Without this guard, indexing the empty list raised IndexError.
            raise DropItem('No image downloaded for %r' % item.get('nickname'))

        # A path separator inside the nickname would point the rename at a
        # directory that does not exist, so neutralise it.
        safe_name = item['nickname'].replace('/', '_').replace(os.sep, '_')

        # os.path.join copes with IMAGES_STORE with or without a trailing slash.
        source = os.path.join(self.IMAGES_STORE, image_paths[0])
        target = os.path.join(self.IMAGES_STORE, 'full', safe_name + '.jpg')

        # os.replace overwrites an existing target on every platform, so a
        # re-run or a duplicate nickname does not fail on Windows.
        os.replace(source, target)

        # Record the real file path, including the ".jpg" suffix.
        item['imagePath'] = target
        return item
settings.py
# Project settings for the Douyu crawler.

BOT_NAME = 'Douyu'

# Both settings name the same spider package, so the list is derived from it.
NEWSPIDER_MODULE = 'Douyu.spiders'
SPIDER_MODULES = [NEWSPIDER_MODULE]

ROBOTSTXT_OBEY = True

# Default request headers; the User-Agent identifies as an iPhone client.
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'DYZB/4.100 (iPhone; iOS 11.3.1; Scale/3.00)',
    'Accept': 'application/vnd.mapi-yuba.douyu.com.4.0+json',
    'Accept-Language': 'zh-Hans-CN;q=1',
}

# Base directory for downloaded images (relative path).
IMAGES_STORE = 'data/斗鱼主播图片/'

# Registers the project's image pipeline.
ITEM_PIPELINES = {
    'Douyu.pipelines.DouyuImagesPipeline': 300,
}