Random notes on Python web scraping
For my network programming course project I originally planned to just copy an image-scraping solution for assignment 2 off the internet and get by, but then I decided crawlers are actually worth learning, so, starting from zero, I spent almost a week on a Bilibili course (https://www.bilibili.com/video/av50730537/?p=84&t=277). I only made it to lesson 82; I spent a whole day trying to set up Scrapy and failed, and next I have to work on the compilers course project.
These are the scripts from along the way, all dumped here.
json
#encoding:utf-8
import json

# json_str='[{"username": "张三", "age": 18, "country": "china"}, {"username": "李四", "age": 20, "country": "china"}]'
# persons=json.loads(json_str)
# print(type(persons))
# for person in persons:
#     print(person)

with open('person.json','r',encoding='utf-8') as fp:
    persons=json.load(fp)
    print(type(persons))
    for person in persons:
        print(person)
#encoding:utf-8
import csv

def read_csv_demo1():
    with open('stock.csv','r') as fp:
        # reader is an iterator
        reader=csv.reader(fp)
        next(reader)
        for x in reader:
            name=x[3]
            volumn=x[-1]
            print({'name':name,'volumn':volumn})

def read_csv_demo2():
    with open('stock.csv','r') as fp:
        # a reader created with DictReader does not include the header row;
        # it is an iterator, and iterating over it yields a dict per row
        reader=csv.DictReader(fp)
        for x in reader:
            value={"name":x[''],'volumn':x['turnoverVol']}
            print(value)

if __name__=='__main__':
    read_csv_demo2()
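The two demos above only read CSV; as a complement, a minimal sketch (my own, not from the course) of writing rows with csv.DictWriter. The output file name and field names are placeholders:

import csv

# placeholder field names and output file, for illustration only
headers=['name','volumn']
rows=[
    {'name':'stock-a','volumn':10000},
    {'name':'stock-b','volumn':20000},
]
with open('stock_out.csv','w',encoding='utf-8',newline='') as fp:
    writer=csv.DictWriter(fp,fieldnames=headers)
    writer.writeheader()   # write the header row first
    writer.writerows(rows) # then all of the data rows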
import json

persons=[
    {
        'username':"张三",
        'age':18,
        'country':'china'
    },
    {
        'username':'李四',
        'age':20,
        'country':'china'
    }
]

class Person(object):
    country='china'

# json_str=json.dumps(persons)
# with open('person.json','w',encoding='utf-8') as fp:
#     # fp.write(json_str)
#     json.dump(persons,fp,ensure_ascii=False)
#
# a={
#     'person':Person()
# }
# json.dumps(a)  # raises TypeError: Person is not JSON serializable
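The commented-out json.dumps(a) above fails with a TypeError because a Person instance is not JSON serializable; a minimal sketch of my own for getting around that with the default parameter:

import json

class Person(object):
    country='china'

a={'person':Person()}
# default= is called for any object json cannot serialize on its own;
# here it just turns the instance into a small dict
print(json.dumps(a,default=lambda obj:{'country':obj.country}))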
thread
#encoding:utf-8
import threading
import time

def coding():
    for x in range(3):
        print('正在写代码%s'%threading.current_thread())
        time.sleep(1)

def drawing():
    for x in range(3):
        print('正在画图%s'%threading.current_thread())
        time.sleep(1)

def main():
    t1=threading.Thread(target=coding)
    t2=threading.Thread(target=drawing)
    t1.start()
    t2.start()
    print(threading.enumerate())

if __name__ == '__main__':
    main()
#encoding:utf-8
import threading
import time

class CodingThread(threading.Thread):
    def run(self):
        for x in range(3):
            print('正在写代码%s'%threading.current_thread())
            time.sleep(1)

class DrawingThread(threading.Thread):
    def run(self):
        for x in range(3):
            print('正在画图%s' % threading.current_thread())
            time.sleep(1)

def main():
    t1=CodingThread()
    t2=DrawingThread()
    t1.start()
    t2.start()

if __name__ == '__main__':
    main()
#encoding:utf-8
import threading

VALUE=0
gLock=threading.Lock()

def add_value():
    global VALUE
    gLock.acquire()
    for x in range(1000000):
        VALUE+=1
    gLock.release()
    print('value: %d'%VALUE)

def main():
    for x in range(2):
        t=threading.Thread(target=add_value)
        t.start()

if __name__ == '__main__':
    main()
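The same counter written with the lock as a context manager, a small sketch of my own; "with gLock:" acquires on entry and releases on exit even if the body raises:

import threading

VALUE=0
gLock=threading.Lock()

def add_value():
    global VALUE
    with gLock:  # acquire() on entry, release() on exit
        for x in range(1000000):
            VALUE+=1
        print('value: %d'%VALUE)

def main():
    for x in range(2):
        threading.Thread(target=add_value).start()

if __name__ == '__main__':
    main()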
import threading
import random
import time

gMoney=1000
gCondition=threading.Condition()
gTotalTimes=10
gTimes=0

class Producer(threading.Thread):
    def run(self):
        global gMoney
        global gTimes
        while True:
            money=random.randint(100,1000)
            gCondition.acquire()
            if gTimes>=gTotalTimes:
                gCondition.release()
                break
            gMoney+=money
            gTimes+=1
            print('%s生产了%d元钱,剩余%d元钱'%(threading.current_thread(),money,gMoney))
            # wake up any consumers waiting for money
            gCondition.notify_all()
            gCondition.release()
            time.sleep(0.5)

class Consumer(threading.Thread):
    def run(self):
        global gMoney
        while True:
            money=random.randint(100,1000)
            gCondition.acquire()
            while gMoney<money:
                if gTimes>=gTotalTimes:
                    gCondition.release()
                    return
                print('%s消费者准备消费%d元钱,剩余%d元钱,不足!'%(threading.current_thread(),money,gMoney))
                # wait() releases the condition and blocks until notify/notify_all
                gCondition.wait()
            gMoney-=money
            print('%s消费者消费了%d元钱,剩余%d元钱'%(threading.current_thread(),money,gMoney))
            gCondition.release()
            time.sleep(0.5)

def main():
    for x in range(5):
        t=Producer(name="生产者线程%d"%x)
        t.start()
    for x in range(3):
        t=Consumer(name='消费者线程%d'%x)
        t.start()

if __name__ == '__main__':
    main()
#encoding:utf-8
from queue import Queue
import threading
import time

def set_value(q):
    index=0
    while True:
        q.put(index)
        index+=1
        time.sleep(3)

def get_value(q):
    while True:
        print(q.get())

def main():
    q=Queue(4)
    t1=threading.Thread(target=set_value,args=[q])
    t2=threading.Thread(target=get_value,args=[q])
    t1.start()
    t2.start()

if __name__ == '__main__':
    main()
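A few more Queue calls the example above relies on implicitly, as a quick sketch of my own: put/get block by default when the queue is full/empty, and the non-blocking variants raise queue.Full / queue.Empty:

from queue import Queue, Full, Empty

q=Queue(4)
for x in range(4):
    q.put(x)                     # would block if the queue were full
print(q.full(), q.qsize())       # True 4

try:
    q.put(99, block=False)       # non-blocking put on a full queue
except Full:
    print('queue is full')

while not q.empty():
    print(q.get())               # would block if the queue were empty

try:
    q.get(block=False)           # non-blocking get on an empty queue
except Empty:
    print('queue is empty')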
#encoding:utf-8
from selenium import webdriver
import time
driver_path=r"C:\Users\ASUS\AppData\Local\Temp\geckodriver-v0.24.0-win64-3"
driver=webdriver.Firefox(executable_path=driver_path)
driver.get('https://www.baidu.com/')
print(driver.page_source)
for cookie in driver.get_cookies():
    print(cookie)
print('='*30)
print(driver.get_cookie('PSTM'))
# #encoding:utf-8
#
# from selenium import webdriver
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.common.by import By
#
# driver_path=r"D:\ProgramApp\chromedriver\chromedriver.exe"
# driver=webdriver.Chrome(executable_path=driver_path)
# driver.get('http://www.douban.com/')
#
# # implicit wait: applies to every find_element call
# driver.implicitly_wait(20)
#
# # explicit wait: block until the given condition is met (or time out)
# WebDriverWait(driver,10).until(
#     EC.presence_of_element_located((By.ID,'sfasfsd'))
# )
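A runnable variant of the explicit-wait pattern above, a sketch of my own; the chromedriver path is the same placeholder as above, and 'kw'/'su' are the id attributes of Baidu's search box and search button (the latter also appears further down in these notes):

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

driver_path=r"D:\ProgramApp\chromedriver\chromedriver.exe"
driver=webdriver.Chrome(executable_path=driver_path)
driver.get('https://www.baidu.com/')

# wait at most 10 seconds for the search box to show up, then use it
search_box=WebDriverWait(driver,10).until(
    EC.presence_of_element_located((By.ID,'kw'))
)
search_box.send_keys('python')
driver.find_element_by_id('su').click()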
#encoding:utf-8
from selenium import webdriver

driver_path=r"D:\ProgramApp\chromedriver\chromedriver.exe"
options=webdriver.ChromeOptions()
# route the browser traffic through a proxy
options.add_argument("--proxy-server=http://49.85.14.35:28149")
driver=webdriver.Chrome(executable_path=driver_path,chrome_options=options)
driver.get('https://www.baidu.com/')

submitBtn=driver.find_element_by_id('su')
print(type(submitBtn))
print(submitBtn.get_attribute("value"))
driver.save_screenshot('baidu.png')
regex
#encoding:utf-8
import re

# # 1. match a literal substring
# text="ahello"
# ret=re.search('he',text)
# print(ret.group())

# 2. dot: matches any character (except newline)
# text="\n"
# ret=re.match('.',text)
# print(ret.group())

# 3. \d: matches any digit (0-9)
# text="a"
# ret=re.match('\d',text)
# print(ret.group())

# 4. \D: matches any non-digit
# text="2"
# ret=re.match('\D',text)
# print(ret.group())

# 5. \s: matches whitespace (\n, \t, \r, space)
# text="\r"
# ret=re.match('\s',text)
# print(ret.group())

# 6. \w: matches a-z, A-Z, digits and underscore
# text="+"
# ret=re.match('\w',text)
# print(ret.group())

# 7. \W: the opposite of \w
# text="a"
# ret=re.match('\w',text)
# print(ret.group())

# 8. []: character class; any character listed inside the brackets matches
# text="09"
# ret=re.match('[^0-9]',text)
# print(ret.group())

# 8.2. a character class instead of \w
# text="_"
# ret=re.match('[a-zA-Z0-9_]',text)
# print(ret.group())

# 8.4. a character class instead of \W
# text="_"
# ret=re.match('[^a-zA-Z0-9_]',text)
# print(ret.group())

##### matching multiple characters #####

# 9. *: matches zero or more characters
# text="abcd"
# ret=re.match('\s*',text)
# print(ret.group())

# 10. +: matches one or more characters
# text="+abcd"
# ret=re.match('\w+',text)
# print(ret.group())

# 11. ?: matches zero or one character (either absent or exactly one)
# text="abcd"
# ret=re.match('\w?',text)
# print(ret.group())

# 12. {m}: matches exactly m characters
# text="abcd"
# ret=re.match('\w{2}',text)
# print(ret.group())

# 13. {m,n}: matches between m and n characters
# text="abcda"
# ret=re.match('\w{1,5}',text)
# print(ret.group())

# 14. validating a phone number:
# text="18578900987"
# ret=re.match('1[34578]\d{9}',text)
# print(ret.group())

# 15. validating an email address
# text="hynever12_@qq.com"
# ret=re.match('\w+@[a-z0-9]+\.[a-z]+',text)
# print(ret.group())

# 16. validating a URL
# text="https://baike.baidu.com/item/Python/407313?fr=aladdin"
# ret=re.match('(http|https|ftp)://[^\s]+',text)
# print(ret.group())

# 17. validating an ID-card number
# text="3113111890812323X"
# ret=re.match('\d{17}[\dxX]',text)
# print(ret.group())

# 18. ^ (caret): matches at the beginning
# text="hello"
# ret=re.search('^h',text)
# print(ret.group())

# 19. $: matches at the end
# text="xxx@163.com"
# ret=re.match('\w+@163\.com$',text)
# print(ret.group())

# 20. |: matches one of several strings or expressions
# text="httpsdfdas"
# ret=re.match('(ftp|http|https)$',text)
# print(ret.group())

# 21. greedy mode vs non-greedy mode:
# text="0123456"
# ret=re.match('\d+?',text)
# print(ret.group())
# text="<h1>标题<h1>"
# ret=re.match('<.+?>',text)
# print(ret.group())

# 22. matching a number between 0 and 100
# allowed: 1, 2, 3, 10, 99, 100
# not allowed: 09, 101
text="01"
ret=re.match('[1-9]\d?$|100$',text)
# "01" is rejected by the pattern, so ret is None here; guard before calling group()
if ret:
    print(ret.group())
else:
    print('no match')
#encoding:utf-8
import re

# text="apple price is $299"
# ret=re.search("\$\d+",text)
# print(ret.group())

# r = raw string: backslashes are not treated as escape characters
# text=r'\n'
# print(text)

text="\c"  # \c is not a recognized escape, so this is literally a backslash followed by c
# in Python source code: '\\n' is the two characters \ and n
# in a regex pattern:    \n matches a newline,
#                        \\n (written '\\\\n' or r'\\n' in Python) matches the characters \n
#                        \\c (written r'\\c') matches the characters \c
# ret=re.match(r'\\c',text)
# print(ret.group())
# grouping
# text="apple's price $99,orange's price is $10"
# ret=re.search('.*(\$\d+).*(\$\d+)',text)
# print(ret.group(0))
# # ret.group(0) is the same as ret.group()
# print(ret.group(1))
# print(ret.group(2))
# print(ret.group(1,2))
# # groups() returns all subgroups at once
# print(ret.groups())

# findall function
# text="apple's price $99,orange's price is $10"
# ret=re.findall('\$\d+',text)
# print(ret)
html="""
<dd class="job_bt">
<h3 class="description">职位描述:</h3>
<div class="job-detail">
<p>职位描述</p>
<p>工作职责(包括但不限于):</p>
<p>1、负责全网视频数据的抓取、去重、识别等工作;</p>
<p>2、负责核心数据抓取及存储系统的架构设计、优化;</p>
<p><br></p>
<p>任职资格:</p>
<p>1.本科及以上学历,2年左右相关工作经验, 有扎实的数据结构和算法功底,</p>
<p>2.熟悉linux开发环境,熟练掌握Python编程语言;</p>
<p>3.有爬虫,信息抽取,文本分类相关经验者优先 ;</p>
<p>4.工作认真细致踏实,较强的学习能力、分析解决问题能力</p>
</div>
</dd>
"""
# # sub function: replace every match
# ret=re.sub('<.+?>',"",html)
# print(ret)

# split function
# text="hello world ni hao"
# ret=re.split('[^a-zA-Z]',text)
# print(ret)

text="the number is 20.50"
# r=re.compile('.\d+\.?\d*')
r=re.compile(r"""
    \d+   # digits before the decimal point
    \.?   # the decimal point itself
    \d*   # digits after the decimal point
    """,re.VERBOSE)
ret=re.search(r,text)
print(ret.group())
#encoding:utf-8
import requests
import re

def parse_page(url):
    headers={
        'User-Agent':"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3676.400 QQBrowser/10.4.3505.400"
    }
    response=requests.get(url,headers=headers)
    text=response.text
    titles=re.findall(r'<div\sclass="cont">.*?<b>(.*?)</b>',text,re.DOTALL)
    dynasties=re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>',text,re.DOTALL)
    authors=re.findall(r'<p class="source">.*?<a.*?>.*?<a.*?>(.*?)</a>',text,re.DOTALL)
    content_tags=re.findall(r'<div class="contson".*?>(.*?)</div>',text,re.DOTALL)
    contents=[]
    for content in content_tags:
        # strip the remaining html tags out of the poem body
        x=re.sub(r'<.*?>',"",content)
        contents.append(x.strip())
    poems=[]
    for value in zip(titles,dynasties,authors,contents):
        # zip yields tuples, which unpack like a=(1,2,3); a,b,c=a
        title,dynasty,author,content=value
        poem={
            'title':title,
            'dynasty':dynasty,
            'author':author,
            'content':content
        }
        poems.append(poem)
    for poem in poems:
        print(poem)
        print('='*40)

def main():
    for x in range(1,11):
        url='https://www.gushiwen.org/default_%s.aspx'%x
        parse_page(url)

if __name__ == '__main__':
    main()
#encoding: utf-8
import requests
url="http://www.renren.com/PLogin.do"
data={"email":"1142883959@qq.com",'password':"mc883228"}
headers={
'User-Agent':"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3676.400 QQBrowser/10.4.3505.400"
}
session=requests.Session()
session.post(url,data=data,headers=headers)
response=session.get('http://www.renren.com/880151247/profile')
with open('renren.html','w',encoding='utf-8') as fp:
    fp.write(response.text)
# #encoding: utf-8
# from lxml import etree
# parser=etree.HTMLParser(encoding='utf-8')
# html=etree.parse("tencent.html",parser=parser)
#
#
# # 1. get tr tags
# # //tr
# # the xpath function returns a list
# trs=html.xpath("//tr[2]")[0]
# print(trs)
#encoding: utf-8
import requests
headers={
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3676.400 QQBrowser/10.4.3505.400',
'Referer':"https://careers.tencent.com/citymain.html",
'Accept':"image/webp,image/apng,image/*,*/*;q=0.8",
'Accept-Encoding':"gzip, deflate, br",
'Accept-Language': "zh-CN,zh;q=0.9",
'Connection':" keep-alive",
'Host': "cdn.multilingualres.hr.tencent.com",
'If-Modified-Since': "Mon, 22 Apr 2019 20:20:23 GMT"
}
responce=requests.get("https://careers.tencent.com/search.html?pcid=40001",headers=headers)
with open('tencent.html','w',encoding='utf-8') as fp:
    fp.write(responce.content.decode('utf-8'))
print(responce.url)
spider
#encoding: utf-8
import requests
from bs4 import BeautifulSoup

def parse_page(url):
    headers={
        'User-Agent':"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3676.400 QQBrowser/10.4.3505.400"
    }
    response=requests.get(url,headers=headers)
    text=response.content.decode('utf-8')
    soup=BeautifulSoup(text,'html5lib')
    conMidtab=soup.find('div',class_='conMidtab')
    tables=conMidtab.find_all('table')
    for table in tables:
        trs=table.find_all('tr')[2:]
        for index,tr in enumerate(trs):
            tds=tr.find_all('td')
            city_td=tds[0]
            if index==0:
                # in the first row of each table the province comes first, so the city is in the second cell
                city_td=tds[1]
            city=list(city_td.stripped_strings)[0]
            temp_td=tds[-2]
            min_temp=list(temp_td.stripped_strings)[0]
            print({'city':city,"min_temp":min_temp})

def main():
    url='http://www.weather.com.cn/textFC/gat.shtml'
    parse_page(url)

if __name__ == '__main__':
    main()
# #encoding: utf-8
#
# from bs4 import BeautifulSoup
#
# html="""
#
#
#
# """
# bs=BeautifulSoup(html,"lxml")
# print(bs.prettify())
#
# 1. get all tr tags
# trs=soup.find_all('tr')
# for tr in trs:
#     print(type(tr))
#     break
# from bs4.element import Tag
#
# 2. get the second tr tag
# tr=soup.find_all('tr',limit=2)[1]
# print(tr)
# 3. get all tr tags whose class equals "even"
#
# trs=soup.find_all('tr',attrs={'class':'even'})
# for tr in trs:
#     print(tr)
#     print('='*30)
#
#
# 4. extract the a tags whose id is "test" and whose class is also "test"
# aList=soup.find_all('a',attrs={"id":"test","class":"test"})
# for a in aList:
#     print(a)
#
#
# 5. get the href attribute of every a tag
# aList=soup.find_all('a')
# for a in aList:
#     href=a.attrs['href']
#     print(href)
#
# 6. get all the job information (plain text)
# trs=soup.find_all('tr')[1:]
# movies=[]
# for tr in trs:
#     movie={}
#     tds=tr.find_all("td")
#     title=tds[0]
#     print(title.string)
#     category=tds[1].string
#     nums=tds[2].string
#     city=tds[3].string
#     pubtime=tds[4].string
#     movie['title']=title
#     movie['category']=category
#     movie['nums']=nums
#     movie['city']=city
#     movie['pubtime']=pubtime
#     movies.append(movie)
#
#     infos=list(tr.stripped_strings)
#     movie['title']=infos[0]
#     movie['category']=infos[1]
#     movie['nums']=infos[2]
#     movie['city']=infos[3]
#     movie['pubtime']=infos[4]
#     movies.append(movie)
#
# print(movies)
# a=soup.find('a')
# print(a)
# tr=soup.find_all('tr')[1]
# text=list(tr.strings)
# print(text)
#encoding: utf-8
soup=BeautifulSoup(html,'lxml')

# 1. get all tr tags
trs=soup.select("tr")
for tr in trs:
    print(type(tr))
    print("="*30)
    break

# 2. get the second tr tag
tr=soup.select('tr')[1]
print(tr)

# 3. get all tr tags whose class equals "even"
trs=soup.select("tr[class='even']")
for tr in trs:
    print(tr)

# 4. get the href attribute of every a tag
aList=soup.select('a')
for a in aList:
    href=a['href']
    print(href)

# 5. get all the job information (plain text)
trs=soup.select('tr')
for tr in trs:
    infos=list(tr.stripped_strings)
    print(infos)
#encoding: utf-8
from bs4 import BeautifulSoup
html="""
"""
from bs4.element import Tag
from bs4.element import NavigableString
from bs4.element import Comment

soup=BeautifulSoup(html,'lxml')
table=soup.find('table')
print(type(table.children))
import requests
from bs4 import BeautifulSoup
import html5lib
from pyecharts import Bar

ALL_DATA = []

def parse_page(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
    }
    response = requests.get(url, headers=headers)
    text = response.content.decode('utf-8')
    # the html5lib parser is needed here to repair the page's incomplete html tags
    soup = BeautifulSoup(text,'html5lib')
    conMidtab = soup.find('div',class_='conMidtab')
    tables = conMidtab.find_all('table')
    for table in tables:
        trs = table.find_all('tr')[2:]
        for index,tr in enumerate(trs):
            tds = tr.find_all('td')
            city_td = tds[0]
            if index == 0:
                city_td = tds[1]
            city = list(city_td.stripped_strings)[0]
            temp_td = tds[-2]
            temp = list(temp_td.stripped_strings)[0]
            # print({'city':city,'temp':int(temp)})
            ALL_DATA.append({'city':city,'temp':int(temp)})

def main():
    url_list = [
        'http://www.weather.com.cn/textFC/hb.shtml',
        'http://www.weather.com.cn/textFC/db.shtml',
        'http://www.weather.com.cn/textFC/hd.shtml',
        'http://www.weather.com.cn/textFC/hz.shtml',
        'http://www.weather.com.cn/textFC/hn.shtml',
        'http://www.weather.com.cn/textFC/xb.shtml',
        'http://www.weather.com.cn/textFC/xn.shtml',
        'http://www.weather.com.cn/textFC/gat.shtml',
    ]
    for url in url_list:
        parse_page(url)
    # sort by lowest temperature and keep only the first 10
    ALL_DATA.sort(key=lambda data:data['temp'])
    data = ALL_DATA[0:10]
    # pull out the cities and the temperatures separately
    cities = list(map(lambda x:x['city'],data))
    temps = list(map(lambda x:x['temp'],data))
    chart = Bar("中国天气最低气温排行榜")
    chart.add('',cities,temps)
    chart.render('temperature.html')

if __name__ == '__main__':
    main()
#encoding: utf-8
# 1. fetch the page from the target site
import requests
from lxml import etree

headers={
    'User-Agent':"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3676.400 QQBrowser/10.4.3505.400",
    'Referer':'https://movie.douban.com/'
}

# 2. extract the data from the fetched page according to the rules below
url='https://movie.douban.com/cinema/nowplaying/changsha/'
response=requests.get(url,headers=headers)
text=response.text
html=etree.HTML(text)
ul=html.xpath("//ul[@class='lists']")[0]
lis=ul.xpath("./li")
movies=[]
for li in lis:
    title=li.xpath("@data-title")[0]
    score=li.xpath("@data-score")[0]
    duration=li.xpath("@data-duration")[0]
    region=li.xpath("@data-region")[0]
    director=li.xpath("@data-director")[0]
    actors=li.xpath("@data-actors")[0]
    thumbnail=li.xpath(".//img/@src")
    movie={
        'title':title,
        'score':score,
        'duration':duration,
        'region':region,
        'director':director,
        'actors':actors,
        'thumbnail':thumbnail
    }
    movies.append(movie)
print(movies)
# print(etree.tostring(ul,encoding='utf-8').decode("utf-8"))
#encoding: utf-8
from lxml import etree
import requests

BASE_DOMAIN='http://www.ygdy8.net'
url="http://www.ygdy8.net/html/gndy/dyzz/list_23_2.html"
headers={
    'User-Agent':"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25"
}

def get_detail_urls(url):
    response=requests.get(url,headers=headers)
    text=response.text.encode('utf-8')
    # text=response.content.decode('gbk')
    html=etree.HTML(text)
    detail_urls=html.xpath("//table[@class='tbspan']//a/@href")
    detail_urls=map(lambda url:BASE_DOMAIN+url,detail_urls)
    return detail_urls

def parse_detail_page(url):
    movie={}
    response=requests.get(url,headers=headers)
    text=response.content.decode('gbk')
    html=etree.HTML(text)
    title=html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
    movie['title']=title
    zoomE=html.xpath("//div[@id='Zoom']")[0]
    imgs=zoomE.xpath(".//img/@src")
    cover=imgs[0]
    screenshot=imgs[1]
    movie['cover']=cover
    movie['screenshot']=screenshot

    def parse_info(info,rule):
        return info.replace(rule,"").strip()

    # the label prefixes that each branch should match were left blank in these notes
    infos=zoomE.xpath(".//text()")
    for index,info in enumerate(infos):
        if info.startswith(""):
            info=parse_info(info,"")
            movie['year']=info
        elif info.startswith(""):
            info=parse_info(info,"")
            movie['country']=info
        elif info.startswith(""):
            info=parse_info(info,"")
            movie['category']=info
        elif info.startswith(""):
            info=parse_info(info,"")
            movie['douban_rating']=info
        elif info.startswith(""):
            info=parse_info(info,"")
            movie['duration']=info
        elif info.startswith(""):
            info=parse_info(info,"")
            movie['director']=info
        elif info.startswith(""):
            info=parse_info(info,"")
            actors=[info]
            for x in range(index+1,len(infos)):
                actor=infos[x].strip()
                if actor.startswith(""):
                    break
                actors.append(actor)
            movie['actors']=actors
        elif info.startswith(""):
            info=parse_info(info,"")
            for x in range(index+1,len(infos)):
                profile=infos[x].strip()
                movie["profile"]=profile
    download_url=html.xpath("//td[@bgcolor='#fdfddf']/a/@href")
    movie['download_url']=download_url
    return movie

def spider():
    base_url="http://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html"
    movies=[]
    for x in range(1,8):
        # the first for loop walks the 7 list pages
        url=base_url.format(x)
        detail_urls=get_detail_urls(url)
        for detail_url in detail_urls:
            # the second for loop walks every movie detail url on one list page
            movie=parse_detail_page(detail_url)
            movies.append(movie)
            print(movie)

if __name__ == '__main__':
    spider()
# #encoding: utf-8
# import requests
#
# responce=requests.get("http://www.baidu.com/")
# print(type(responce.content))
# print(responce.content.decode('utf-8'))
#
#
# print(responce.url)
# print(responce.encoding)
# print(responce.status_code)
#encoding: utf-8
import requests
params={
'wd':'中国'
}
headers={
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3676.400 QQBrowser/10.4.3505.400'
}
responce=requests.get("https://www.baidu.com/s",params=params,headers=headers)
with open('baidu.html','w',encoding='utf-8') as fp:
    fp.write(responce.content.decode('utf-8'))
print(responce.url)
#encoding: utf-8
import requests

proxy={
    'http':'182.34.34.191:9999'
}
response=requests.get("http://httpbin.org/ip",proxies=proxy)
print(response.text)
from lxml import etree

parser=etree.HTMLParser(encoding='utf-8')
html=etree.parse("tencent.html",parser=parser)

# 1. get all tr tags
trs=html.xpath("//tr")
for tr in trs:
    print(etree.tostring(tr,encoding='utf-8').decode('utf-8'))

# 2. get the second tr tag
tr=html.xpath("//tr[2]")[0]
print(tr)

# 3. get all tr tags whose class equals "even"
trs=html.xpath("//tr[@class='even']")
for tr in trs:
    print(tr)

# 4. get the href attribute of every a tag
aList=html.xpath("//a/@href")
for a in aList:
    print("http://hr.tencent.com/"+a)

# 5. get all the job information (plain text)
trs=html.xpath("//tr[position()>1]")
positions=[]
for tr in trs:
    href=tr.xpath(".//a/@href")[0]
    fullurl='http://hr.tencent.com/'+href
    title=tr.xpath("./td[1]//text()")[0]
    category=tr.xpath("./td[2]/text()")[0]
    print(category)
    nums=tr.xpath("./td[3]/text()")[0]
    address=tr.xpath("./td[4]/text()")[0]
    pubtime=tr.xpath("./td[5]/text()")[0]
    print(address)
    position={
        'url':fullurl,
        'title':title,
        'category':category,
        'nums':nums,
        'address':address,
        'pubtime':pubtime
    }
    positions.append(position)
    break
#encoding: utf-8
import requests
url="http://www.renren.com/PLogin.do"
data={"email":"970138079@qq.com",'password':"pythonspider"}
headers={
'User-Agent':"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3676.400 QQBrowser/10.4.3505.400"
}
session=requests.Session()
session.post(url,data=data,headers=headers)
response=session.get('http://www.renren.com/880151247/profile')
with open('renren.html','w',encoding='utf-8') as fp:
    fp.write(response.text)
#encoding: utf-8
from urllib import parse
from urllib import request

# the urlretrieve function: download a url straight into a local file
#request.urlretrieve('https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1559904206&di=80bc57b1e2a8b53b00ab9adc411923e9&imgtype=jpg&er=1&src=http%3A%2F%2Fb-ssl.duitang.com%2Fuploads%2Fitem%2F201706%2F30%2F20170630005238_ztsxW.jpeg','luban.jpg')

# the urlencode function: encode a dict into url query-string form
# params={'name':'张三',"age":18,'greet':'hello world'}
# result=parse.urlencode(params)
# print(result)
#
# url='http://www.baidu.com/s'
# params={"wd":"刘德华"}
# qs=parse.urlencode(params)
# url=url+"?"+qs
# resp=request.urlopen(url)
# print(resp.read())

# parse_qs decodes a query string back into a dict
params={'name':'张三',"age":18,'greet':'hello world'}
qs=parse.urlencode(params)
print(qs)
result=parse.parse_qs(qs)
print(result)