练习-爬取豆瓣电影TOP250信息
数新的同学提问,刚好回忆一下大一小学期学习的爬虫(水一篇博客)。
与网上众多豆瓣TOP250爬虫教程不同的是,此次需要爬取更多的内容(更麻烦一些)。
首先豆瓣是有反爬虫措施的(形同没有,请求时带上浏览器的User-Agent请求头即可),然后用requests库和xpath轻轻松松爬到所有的信息,最后就是数据清洗。注意由于内容涉及到中文字符,全文需以utf-8格式处理;同时页面里的空白分隔符是HTML的&nbsp;(不间断空格),解析后会变成\xa0(事儿多,直接去掉),其余就是无聊的字符串处理了。
很久没有写python了,基本语法记得还可以,水水这些简单的任务还是可行的。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Scrape the Douban Top 250 movie chart and save the result as a JSON file."""
from lxml import etree
import requests
import json

# Browser-like User-Agent sent with every request instead of the library default.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}

# One dict per movie; filled by getRequests() and serialized by output().
movie_list = []

# Seconds to wait for the server before giving up on a page.
_REQUEST_TIMEOUT = 10


def _first(nodes, default=""):
    """Return the first XPath result, or *default* when nothing matched."""
    return nodes[0] if nodes else default


def _join_other_titles(titles):
    """Return every title after the first one, cleaned and space-terminated.

    Non-breaking spaces and the "/" separators are removed from each
    alternative title; each cleaned title is followed by a single space.
    """
    return "".join(
        title.replace("\xa0", "").replace("/", "").strip() + " "
        for title in titles[1:]
    )


def _parse_director(text):
    """Extract the director name(s) from the "director ... cast" text line.

    Everything from the first "主" onward (the start of the cast label) is
    dropped. Of what remains, only the text that follows an odd number of
    ":" characters is kept, i.e. the value after the label's colon.
    """
    cleaned = text.replace("/", " ").replace("\n", "").strip()
    before_cast = cleaned.partition("主")[0]
    return "".join(before_cast.split(":")[1::2]).strip()


def _split_meta(text):
    """Split the "year / country / genre" line into its three fields.

    Returns:
        A ``(year, country, classify)`` tuple of stripped strings.

    The last two "/"-separated fields are taken as country and genre and
    everything before them as the year, so a line with more than three
    fields no longer shifts later fields into the wrong keys. Missing
    fields become empty strings instead of raising IndexError.
    NOTE(review): assumes country and genre are always the final two
    fields -- confirm against the live page.
    """
    cleaned = text.replace("\xa0", " ").replace("\n", "").strip()
    parts = [part.strip() for part in cleaned.split("/")]
    parts += [""] * (3 - len(parts))  # pad short lines; no-op when len >= 3
    *years, country, classify = parts
    return " / ".join(years), country, classify


def _parse_item(info):
    """Build one movie record from a single ``div.item`` element."""
    titles = info.xpath("div[2]/div[1]/a/span/text()")  # movie titles
    directors = info.xpath("div[2]/div[2]/p[1]/text()[1]")  # director line
    year_country_classify = info.xpath(
        "div[2]/div[2]/p[1]/text()[2]")  # year / country / genre line
    stars = info.xpath("div[2]/div[2]/div/span[2]/text()")  # rating score
    starpeople = info.xpath("div[2]/div[2]/div/span[4]/text()")  # vote count
    details = info.xpath("div[2]/div[2]/p[2]/span/text()")  # one-line quote

    year, country, classify = _split_meta(_first(year_country_classify))
    return {
        "title": {
            "chinese": _first(titles),
            "others": _join_other_titles(titles)
        },
        "director": _parse_director(_first(directors)),
        "year": year,
        "country": country,
        "classify": classify,
        "rating": {
            "num": _first(stars),
            "people": _first(starpeople).replace("人评价", "")
        },
        # Some entries have no quote; store an empty string for those.
        "quote": _first(details)
    }


def getRequests():
    """Download the ten chart pages and append each movie to ``movie_list``.

    Records are appended to the module-level list, so calling this twice
    stores every movie twice.

    Raises:
        requests.RequestException: on a network failure, a timeout, or a
            non-2xx HTTP status (so an error page is not silently parsed
            as an empty result).
    """
    urls = [
        "https://movie.douban.com/top250?start={}".format(start)
        for start in range(0, 250, 25)
    ]
    for url in urls:
        response = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
        response.raise_for_status()
        html = etree.HTML(response.text)
        # Each movie on the page lives in its own <div class="item">.
        for info in html.xpath("//div[@class='item']"):
            movie_list.append(_parse_item(info))


def output():
    """Write ``movie_list`` to a UTF-8 JSON file and print "end" when done."""
    with open("豆瓣TOP250电影.json", "w", encoding="UTF-8") as file:
        # ensure_ascii=False keeps Chinese text readable instead of \u escapes.
        json.dump({"data": movie_list}, file, ensure_ascii=False)
    print("end")


if __name__ == '__main__':
    getRequests()
    output()