# python爬取猫眼电影top_100
# 初学者专用,超基础
import re
import requests
from urllib3.exceptions import RequestError
import csv
import pandas as pd
def get_one_page(url, timeout=10):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The response text when the status code is 200; ``None`` when the
        status code is anything else or the request itself fails.
    """
    # A browser-like User-Agent is sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except (requests.RequestException, RequestError):
        # requests reports connection errors, timeouts, invalid URLs, etc. as
        # RequestException subclasses; the urllib3 RequestError that was
        # caught originally is still handled for backward compatibility.
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_one_page(html):
    """Extract movie records from one page of board HTML.

    Every ``<dd>...</dd>`` entry matched by the pattern contributes one value
    to each of six parallel lists.

    Args:
        html: Page source as a string. ``None`` or an empty string (for
            example after a failed download) yields six empty lists.

    Returns:
        A tuple ``(index, image, title, actor, time, score)`` of lists of
        strings, all of the same length and in page order.
    """
    if not html:
        return [], [], [], [], [], []

    # Raw strings keep ``\d`` a regex escape instead of an invalid Python
    # string escape. re.S lets ``.`` span the newlines inside an entry.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S,
    )
    items = pattern.findall(html)

    index = [item[0] for item in items]
    image = [item[1] for item in items]
    title = [item[2] for item in items]
    # The first 3 characters of the stripped cast text are dropped --
    # presumably a fixed label prefix on the page; TODO confirm against the
    # live markup.
    actor = [item[3].strip()[3:] for item in items]
    # Likewise the first 5 characters of the release-time text are dropped
    # (presumably its label prefix).
    time = [item[4].strip()[5:] for item in items]
    # The score is captured as separate integer and fraction pieces; join them.
    score = [item[5] + item[6] for item in items]
    return index, image, title, actor, time, score
def write_to_file(index, image, title, actor, time, score):
    """Append one batch of movie records to ``MaoY_top10_movie.csv``.

    The six arguments are parallel sequences; element *i* of each forms one
    CSV row. Rows are appended without a header line and without the
    DataFrame's own row index.

    Args:
        index: Values for the ``index`` column.
        image: Values for the ``image`` column.
        title: Values for the ``title`` column.
        actor: Values for the ``actor`` column.
        time: Values for the ``time`` column.
        score: Values for the ``score`` column.
    """
    frame = pd.DataFrame({
        "index": index,
        'image': image,
        'title': title,
        'actor': actor,
        'time': time,
        'score': score,
    })
    # mode='a' appends to the existing file; header=False keeps the column
    # names from being written again on every call.
    frame.to_csv('MaoY_top10_movie.csv', sep=',', mode='a', encoding='utf-8-sig', index=False, header=False)
def add_file_head():
    """Create (or truncate) ``MaoY_top10_movie.csv`` and write its header row.

    The file is opened in ``'w'`` mode, so any existing content is discarded
    and only the column-name line remains afterwards.
    """
    columns = ["index", "image", "title", "actor", "time", "score"]
    # newline='' lets the csv module control line endings itself.
    with open('MaoY_top10_movie.csv', 'w', newline='', encoding='utf-8-sig') as f:
        csv.writer(f).writerow(columns)
def main(offset):
    """Download, parse and store the board page at the given offset.

    Args:
        offset: Value placed in the ``offset`` query parameter of the board
            URL (converted with ``str``).

    A page whose download fails (``get_one_page`` returns ``None``) is
    skipped instead of being passed on to the parser.
    """
    url = "http://maoyan.com/board/4?offset=" + str(offset)
    html = get_one_page(url)
    if html is None:
        return
    # Parse once and reuse the result; the six lists map one-to-one onto the
    # parameters of write_to_file.
    index, image, title, actor, time, score = parse_one_page(html)
    write_to_file(index, image, title, actor, time, score)
if __name__ == '__main__':
    # Start from a fresh CSV that contains only the header row ...
    add_file_head()
    # ... then process ten pages, stepping the offset by 10 each time
    # (0, 10, ..., 90).
    for i in range(10):
        main(i * 10)