英雄联盟Python爬虫
文章目录
-
- 4.多线程爬取LOL英雄皮肤图片
英雄联盟Python爬虫
英雄主界面(腾讯官网):https://lol.qq.com/data/info-heros.shtml
1.英雄爬取
https://lol.qq.com/data/info-heros.shtml
get方法获取指定英雄信息。
https://lol.qq.com/data/info-heros.shtml?id=xxx
id=xxx
2.JS获取所有英雄信息
import json
import requests
from faker import Factory
from bs4 import BeautifulSoup

f = Factory.create()


def get_all_heros():
    """Download the hero list and print the first 50 heroes.

    The hero list is fetched from the ``hero_list.js`` endpoint, decoded as
    JSON, and the ``heroId``, ``name`` and ``alias`` fields of the first 50
    entries under the ``hero`` key are printed, one hero per line.
    """
    hero_list_url = 'https://game.gtimg.cn/images/lol/act/img/js/heroList/hero_list.js'
    # A random user agent is generated for every request.
    request_headers = {'user-agent': f.user_agent()}

    response = requests.get(hero_list_url, headers=request_headers)
    # Let requests guess the encoding from the body before decoding it.
    response.encoding = response.apparent_encoding
    heroes = json.loads(response.text)['hero']

    for hero in heroes[:50]:
        line = "ID: {0} 姓名:{1} 别名:{2}".format(
            hero['heroId'], hero['name'], hero['alias'])
        print(line)


if __name__ == '__main__':
    get_all_heros()
效果:
3.爬取比赛数据
第一个LOL网页爬取
http://www.wanplus.com/lol/playerstats
该页面使用了 csrf-token:先用 GET 请求取得响应头 Set-Cookie 中的 csrf-token,之后的 POST 请求携带该 token(以及对应的 cookies)即可。
import json
import time
import requests
from faker import Factory
from urllib import parse

f = Factory.create()

# Names of the stats-table columns, in the order they are sent as
# columns[0] .. columns[20] in the form data.
_COLUMN_NAMES = (
    'order', 'playername', 'teamname', 'meta', 'appearedTimes', 'kda',
    'attendrate', 'killsPergame', 'mostkills', 'deathsPergame', 'mostdeaths',
    'assistsPergame', 'mostassists', 'goldsPermin', 'lasthitPermin',
    'damagetoheroPermin', 'damagetoheroPercent', 'damagetakenPermin',
    'damagetakenPercent', 'wardsplacedPermin', 'wardskilledPermin',
)
# The first four columns are sent with orderable=false, the rest with true.
_UNORDERABLE_COLUMNS = 4
# Seconds to wait for the server before giving up on a single request.
_REQUEST_TIMEOUT = 10
# Retry policy used by the script entry point.
_MAX_ATTEMPTS = 5
_RETRY_DELAY = 1


def get_token():
    """Fetch the player-stats page and derive the CSRF token from its cookies.

    Returns:
        A ``(token, cookies)`` tuple: ``token`` is the string sent as
        ``X-CSRF-Token`` / ``_gtk``, ``cookies`` is a plain dict of the
        cookies set by the response.

    Raises:
        ValueError: if the ``wanplus_csrf`` cookie is missing or its tail is
            not an integer.
        requests.RequestException: if the HTTP request fails or times out.
    """
    url = 'http://www.wanplus.com/lol/playerstats'
    headers = {
        'user-agent': f.user_agent(),
        'Referer': 'http://www.wanplus.com/lol/teamstats',
        'Host': 'www.wanplus.com',
    }
    r = requests.get(url, headers=headers, allow_redirects=False,
                     timeout=_REQUEST_TIMEOUT)
    cookies = r.cookies
    r.close()

    csrf = cookies.get('wanplus_csrf')
    if csrf is None:
        # Without this guard the slice below would fail with a TypeError.
        raise ValueError('wanplus_csrf cookie missing from response')
    # The token is the integer after the first 9 characters of the cookie,
    # plus 16777216 (the transformation this script has always applied).
    token = str(int(csrf[9:]) + 16777216)
    return token, cookies.get_dict()


def _build_formdata(token):
    """Build the form fields for the stats request, in the original order."""
    formdata = {'_gtk': token, 'draw': '1'}
    for index, column in enumerate(_COLUMN_NAMES):
        prefix = 'columns[{}]'.format(index)
        orderable = 'false' if index < _UNORDERABLE_COLUMNS else 'true'
        formdata[prefix + '[data]'] = column
        formdata[prefix + '[name]'] = ''
        formdata[prefix + '[searchable]'] = 'true'
        formdata[prefix + '[orderable]'] = orderable
        formdata[prefix + '[search][value]'] = ''
        formdata[prefix + '[search][regex]'] = 'false'
    formdata.update({
        'order[0][column]': '4',
        'order[0][dir]': 'desc',
        'start': '0',
        'length': '20',
        'search[value]': '',
        'search[regex]': 'false',
        'area': '',
        'eid': '1065',
        'type': 'player',
        'gametype': '2',
        'filter': '{"team":{},"player":{},"meta":{}}',
    })
    return formdata


def get_competition():
    """Request the player statistics and print the first 20 rows.

    Returns:
        ``True`` when the statistics were fetched and printed, ``False`` when
        the token could not be obtained, the request failed, or the response
        body is shorter than 100 characters (treated as a failed fetch).
    """
    url = 'http://www.wanplus.com/ajax/stats/list'
    try:
        token, my_cookies = get_token()
        headers = {
            'user-agent': f.user_agent(),
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Host': 'www.wanplus.com',
            'Origin': 'http://www.wanplus.com',
            'Referer': 'http://www.wanplus.com/lol/playerstats',
            'X-CSRF-Token': token,
            'X-Requested-With': 'XMLHttpRequest',
        }
        # Encode the dict as k1=v1&k2=v2.
        data = parse.urlencode(_build_formdata(token))
        r = requests.post(url, cookies=my_cookies, data=data, headers=headers,
                          allow_redirects=False, timeout=_REQUEST_TIMEOUT)
    except (ValueError, requests.RequestException):
        # Missing/invalid token or network failure: report it like any other
        # failed fetch so the caller can retry.
        print('获取失败,重新获取!')
        return False

    r.encoding = r.apparent_encoding
    c = r.text
    if len(c) < 100:
        print('获取失败,重新获取!')
        return False
    print('获取成功!')
    players = json.loads(c)['data']
    for player in players[:20]:
        # The original passed the literal list ['teamid'] here instead of the
        # row's field; .get() keeps the line printable if the key is absent.
        print('队伍编号: {0} 队伍名: {1} 玩家名称: {2}'.format(
            player.get('teamid'), player['teamname'], player['playername']))
    return True


def cookie_to_dic(mycookie):
    """Convert a ``'k1=v1; k2=v2'`` cookie string into a dict.

    Only the first ``=`` of each entry separates key from value, so values
    that themselves contain ``=`` are kept intact. Empty entries are skipped;
    an entry without ``=`` maps to an empty string.
    """
    dic = {}
    for pair in mycookie.split('; '):
        if not pair:
            continue
        key, _, value = pair.partition('=')
        dic[key] = value
    return dic


if __name__ == '__main__':
    # Retry a bounded number of times, pausing between attempts, instead of
    # hammering the server in a tight infinite loop.
    for _attempt in range(_MAX_ATTEMPTS):
        if get_competition():
            break
        time.sleep(_RETRY_DELAY)
第二个LOL网页数据爬取
没有任何反爬和csrf-token认证:
from faker import Factory
import requests
import json

f = Factory.create()


def fun():
    """Print name, ID and win count of the first 20 teams of the tournament.

    The data comes from the ``tournament_team_data`` JSON endpoint; the team
    entries are read from ``data.teams_data`` of the decoded response.
    """
    endpoint = 'http://lol.admin.pentaq.com/api/tournament_team_data?tour=29&patch='
    request_headers = {'user-agent': f.user_agent()}

    response = requests.get(endpoint, headers=request_headers)
    # Decode the body with the encoding requests detects from its content.
    response.encoding = response.apparent_encoding
    body = response.text
    response.close()

    teams = json.loads(body)['data']['teams_data']
    for team in teams[:20]:
        line = "队伍名称: {0} 队伍ID:{1} win:{2}".format(
            team['team_full_name'], team['team_id'], team['win'])
        print(line)


if __name__ == '__main__':
    fun()
第三个LOL网页数据爬取
http://www.op.gg/champion/statistics
采用BeautifulSoup 即可。
import time

from faker import Factory
import requests
from bs4 import BeautifulSoup

f = Factory.create()

# Seconds to wait for the server before giving up on the request.
_REQUEST_TIMEOUT = 10
# Retry policy used by the script entry point.
_MAX_ATTEMPTS = 5
_RETRY_DELAY = 1


def fun():
    """Scrape the champion statistics page and print the first five TOP rows.

    Returns:
        ``True`` when the table was found and processed, ``False`` when the
        request failed, the status code is not 200, the page is shorter than
        10000 characters, or the expected ``tbody`` is not in the page.
    """
    url = 'http://www.op.gg/champion/statistics'
    headers = {
        'user-agent': f.user_agent(),
        # The original value ended with a stray quote character.
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    }
    try:
        r = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException:
        return False
    try:
        if r.status_code != 200:
            return False
        r.encoding = r.apparent_encoding
        c = r.text
    finally:
        # Close the response on every path, including the early return.
        r.close()
    if len(c) < 10000:
        return False

    html = BeautifulSoup(c, 'html.parser')
    tbody = html.find('tbody', class_='tabItem champion-trend-tier-TOP')
    if tbody is None:
        # find() returns None when the element is absent; report a failed
        # fetch instead of crashing with AttributeError.
        return False

    for row in tbody.find_all('tr')[:5]:
        cells = row.find_all('td')
        if len(cells) < 6:
            continue  # row does not have the cells indexed below
        divs = cells[3].find_all('div')
        if len(divs) < 2:
            continue  # name/position divs missing
        name = divs[0].text
        pos = divs[1].text.replace('\t', '').replace('\n', '')
        print('rank: {0} name: {1} pos:{2} 胜率:{3} 登场率:{4}'.format(
            cells[0].text, name, pos, cells[4].text, cells[5].text))
    return True


if __name__ == '__main__':
    # Retry a bounded number of times, pausing between attempts, instead of
    # looping forever when the page keeps failing.
    for _attempt in range(_MAX_ATTEMPTS):
        if fun():
            break
        time.sleep(_RETRY_DELAY)
4.多线程爬取LOL英雄皮肤图片
1.获取对应英雄url 列表,函数get_url_list()
2.下载对应的图片保存到文件夹download()
3.main()开启多线程执行爬取任务
import requests
import json
import os
from faker import Factory
from multiprocessing.dummy import Pool as ThreadPool
import time

f = Factory.create()
headers = {
    'user-agent': f.user_agent()
}

# Seconds to wait for the server before giving up on a single request.
_REQUEST_TIMEOUT = 10
# URL template of the per-hero skin description.
_HERO_URL = 'https://game.gtimg.cn/images/lol/act/img/js/hero/{}.js'
# Characters that must not end up in a file or directory name.
_UNSAFE_CHARS = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})


def _safe_name(text):
    """Return *text* with path separators and reserved characters replaced."""
    return str(text).translate(_UNSAFE_CHARS)


def get_url_list():
    """Return the per-hero skin URLs for every hero in the hero list.

    The hero list is downloaded, each entry's ``heroId`` is read, and one
    URL per hero is built from it. (The original collected the IDs but
    never returned anything.)
    """
    url = 'https://game.gtimg.cn/images/lol/act/img/js/heroList/hero_list.js'
    r = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
    r.encoding = r.apparent_encoding
    c = r.text
    r.close()
    heroes = json.loads(c)["hero"]
    return [_HERO_URL.format(hero["heroId"]) for hero in heroes]


def spider(url):
    """Download every skin image listed at one per-hero *url*.

    Entries whose ``mainImg`` is empty are skipped; the 1-based position of
    the entry in the ``skins`` list is passed on for file naming.
    """
    r = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
    r.encoding = r.apparent_encoding
    c = r.text
    r.close()
    skins = json.loads(c)["skins"]
    # enumerate provides the index used to name the image files.
    for index, skin in enumerate(skins):
        if skin["mainImg"] == '':
            continue
        item = {
            'name': skin["heroName"],
            'skin_name': skin["name"],
            'imgLink': skin["mainImg"],
        }
        download(index + 1, item)


def download(index, contdict):
    """Save one skin image to ``皮肤/<hero name>/<skin name><index>.jpg``.

    Args:
        index: number appended to the file name.
        contdict: dict with the keys ``name``, ``skin_name`` and ``imgLink``.
    """
    # Names come from the remote JSON; sanitise them so a '/' or similar
    # character cannot break the path or escape the target directory.
    folder = os.path.join('皮肤', _safe_name(contdict['name']))
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(folder, exist_ok=True)

    response = requests.get(contdict['imgLink'], headers=headers,
                            timeout=_REQUEST_TIMEOUT)
    if response.status_code != 200:
        # Do not store an error page under a .jpg name.
        print('下载失败:', contdict['imgLink'])
        return
    file_name = _safe_name(contdict['skin_name']) + str(index) + '.jpg'
    # Use a distinct handle name so the module-level faker ``f`` is not shadowed.
    with open(os.path.join(folder, file_name), 'wb') as image_file:
        image_file.write(response.content)


def main():
    """Download the skins of hero IDs 1-10 with a pool of 6 threads and print the elapsed time."""
    start = time.time()
    page = []
    for i in range(1, 11):
        newpage = _HERO_URL.format(i)
        print(newpage)
        page.append(newpage)
    pool = ThreadPool(6)
    try:
        pool.map(spider, page)
    finally:
        # Always release the worker threads, even if a download raised.
        pool.close()
        pool.join()
    end = time.time()
    print('用时:', end-start)


if __name__ == '__main__':
    main()