爬取豆瓣电影新片榜
Python 爬虫 — 豆瓣电影新片榜
import requests
import csv
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from urllib3.exceptions import RequestError
def get_one_page(url):
    """Download a page and return its HTML text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as text when the server answers with HTTP 200;
        ``None`` for any other status code or when the request itself fails
        (connection error, timeout, invalid URL, ...).
    """
    # Send a browser-like User-Agent with the request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36 Edg/80.0.361.66"
    }
    try:
        # Without a timeout a stalled connection would block the script forever.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # requests reports its failures through RequestException subclasses,
        # so this is the type that has to be caught to honour the
        # "return None on failure" contract.
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_one_page(html):
    """Extract the movie titles from the chart page's HTML.

    Args:
        html: HTML text of the page. May be ``None`` or empty, which is what
            a failed download produces.

    Returns:
        A list holding the ``title`` attribute of the first ``<a>`` element
        inside every ``<tr class="item">`` row, in document order. Rows
        without such a link are skipped. An empty list is returned when
        there is no HTML to parse.
    """
    # Nothing to parse: avoid handing None to BeautifulSoup.
    if not html:
        return []
    soup = BeautifulSoup(html, "html.parser")
    titles = []
    for row in soup.find_all("tr", class_="item"):
        link = row.a  # first <a> inside the row, or None if there is none
        # Skip malformed rows instead of failing on a missing tag/attribute.
        if link is not None and link.has_attr("title"):
            titles.append(link["title"])
    return titles
def write_to_file(content):
    """Save the scraped titles to ``Douban_top10_movie.csv``.

    The file is written to the current working directory with two columns:
    ``index`` (1-based rank) and ``top10`` (the title). It is encoded as
    UTF-8 with a BOM (``utf-8-sig``).

    Args:
        content: Sequence of movie titles, in rank order. Any length is
            accepted, including empty (only the header row is written).
    """
    # Derive the rank column from the data so that a scrape returning fewer
    # or more than ten titles does not make DataFrame construction fail with
    # a column-length mismatch.
    index = list(range(1, len(content) + 1))
    # Save the CSV file with pandas.
    table = pd.DataFrame({"index": index, 'top10': content})
    table.to_csv(r'Douban_top10_movie.csv', sep=',', encoding='utf-8-sig', index=False)
if __name__ == '__main__':
    # Script entry point: download the chart page, extract the titles and
    # store them in a CSV file.
    html = get_one_page('https://movie.douban.com/chart')
    if html is None:
        # get_one_page signals a failed request or non-200 answer with None;
        # stop with a clear message instead of passing None to the parser.
        raise SystemExit("Failed to download https://movie.douban.com/chart")
    content = parse_one_page(html)
    write_to_file(content)
爬取结果: