Scrapy实践:爬取腾讯社会招聘信息(文字爬取)
注:爬取后的信息将以json格式存储,文件命名为“recruit.json”,可用Notepad++打开。
代码实现:
items.py
# -*- coding: utf-8 -*-
import scrapy
class TxhrItem(scrapy.Item):
    """One Tencent HR job posting scraped from hr.tencent.com."""
    # Job title
    positionName = scrapy.Field()
    # Job category (may be empty on the site)
    positionType = scrapy.Field()
    # Number of openings
    needNum = scrapy.Field()
    # Work location
    workingSpace = scrapy.Field()
    # Date the posting was published
    publishTime = scrapy.Field()
爬虫文件(spiders/txHRSpider.py)
# -*- coding: utf-8 -*-
import scrapy
from txHR.items import TxhrItem
class TxhrspiderSpider(scrapy.Spider):
    """Crawl Tencent's social-recruitment listings page by page.

    Each table row is parsed into a TxhrItem; pagination advances the
    ``start`` offset by 10 (one page of rows) until 1000 records have
    been requested.
    """
    name = 'txHR'
    allowed_domains = ['tencent.com']
    # NOTE(review): the doubled "?@start=&start=" query string looks odd but
    # is kept verbatim from the original target URL — confirm before changing.
    initialURL = 'https://hr.tencent.com/position.php?@start=&start='
    # Current pagination offset; incremented by 10 (one page) per parse().
    bias = 0
    url = initialURL + str(bias)
    start_urls = [url]

    def parse(self, response):
        """Extract every job row on the page, then queue the next page.

        Listing rows carry alternating CSS classes 'even' / 'odd'.
        """
        for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
            item = TxhrItem()
            # Any cell can be empty on the live site (the original code only
            # guarded td[2]); extract_first() with a default avoids the
            # IndexError that a bare extract()[0] raises on an empty cell.
            item['positionName'] = each.xpath("td[1]/a/text()").extract_first(default="Null")
            item['positionType'] = each.xpath("td[2]/text()").extract_first(default="Null")
            item['needNum'] = each.xpath("td[3]/text()").extract_first(default="Null")
            item['workingSpace'] = each.xpath("td[4]/text()").extract_first(default="Null")
            item['publishTime'] = each.xpath("td[5]/text()").extract_first(default="Null")
            yield item
        # Fetch the first 1000 postings: 100 pages of 10 rows each.
        self.bias += 10
        if self.bias < 1000:
            url = self.initialURL + str(self.bias)
            yield scrapy.Request(url, callback=self.parse)
pipelines.py
# -*- coding: utf-8 -*-
import json
class TxhrPipeline(object):
    """Write each scraped item to recruit.json as one JSON object per line."""

    def __init__(self):
        # Explicit utf-8 is required: dumps() runs with ensure_ascii=False and
        # the data is Chinese text, so a non-UTF-8 locale default would raise
        # UnicodeEncodeError on write.
        self.output = open("recruit.json", 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize *item* as a JSON line and pass it along unchanged."""
        jsonText = json.dumps(obj=dict(item), ensure_ascii=False) + '\n'
        self.output.write(jsonText)
        return item

    def close_spider(self, spider):
        """Close the output file.

        Scrapy invokes this hook as ``close_spider(spider)``; the original
        signature omitted the parameter and raised TypeError, leaving the
        file unflushed.
        """
        self.output.close()
settings.py
# Project identity; also used for logging and the default User-Agent.
BOT_NAME = 'txHR'
SPIDER_MODULES = ['txHR.spiders']
NEWSPIDER_MODULE = 'txHR.spiders'
# NOTE(review): robots.txt is obeyed while a browser User-Agent is spoofed
# below — the site's robots rules may block this crawl entirely; confirm intent.
ROBOTSTXT_OBEY = True
# Headers attached to every request; the Chrome UA avoids trivial bot blocking.
DEFAULT_REQUEST_HEADERS = {
'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.9',
}
# Enable the JSON-lines export pipeline (order value 300, range 0-1000).
ITEM_PIPELINES = {
'txHR.pipelines.TxhrPipeline': 300,
}