Python web scraping: scratch notes

For the networking course project I originally planned to crib an image-scraping script off the internet to get assignment 2 out of the way, but then decided crawlers were actually worth learning, so, starting from zero, I spent almost a week on this Bilibili course: https://www.bilibili.com/video/av50730537/?p=84&t=277. I only made it to lesson 82, spent a whole day failing to get Scrapy configured, and now have to move on to the compilers course project.
These are the scripts I wrote along the way, all dumped here.

json

#encoding:utf-8
import json

# json.loads parses a JSON string into Python objects:
# json_str='[{"username": "张三", "age": 18, "country": "china"}, {"username": "李四", "age": 20, "country": "china"}]'
# persons=json.loads(json_str)
# print(type(persons))
# for person in persons:
#     print(person)

# json.load reads JSON straight from a file object:
with open('person.json','r',encoding='utf-8') as fp:
    persons=json.load(fp)
    print(type(persons))
    for person in persons:
        print(person)
#encoding:utf-8
import csv

def read_csv_demo1():
    with open('stock.csv','r') as fp:
        # reader is an iterator; next() skips the header row
        reader=csv.reader(fp)
        next(reader)
        for x in reader:
            name=x[3]
            volume=x[-1]
            print({'name':name,'volume':volume})

def read_csv_demo2():
    with open('stock.csv','r') as fp:
        # a reader created with DictReader does not yield the header row;
        # iterating over it returns each row as a dict keyed by column name
        reader=csv.DictReader(fp)
        for x in reader:
            # the keys must match the header row of stock.csv;
            # x[''] refers to a column whose header cell is empty
            value={'name':x[''],'volume':x['turnoverVol']}
            print(value)

if __name__=='__main__':
    read_csv_demo2()


import json

persons=[
    {  'username':"张三",
       'age':18,
       'country':'china'
     },
    {
       'username':'李四',
       'age':20,
       'country':'china'
    }
]

class Person(object):
    country='china'

# json_str=json.dumps(persons)
# with open('person.json','w',encoding='utf-8') as fp:
#     # fp.write(json_str)
#     json.dump(persons,fp,ensure_ascii=False)

# a plain object is not JSON serializable, so this raises TypeError:
# a={
#     'person':Person()
# }
# json.dumps(a)



thread
#encoding:utf-8
import threading
import time

def coding():
    for x in range(3):
        print('正在写代码%s'%threading.current_thread())
        time.sleep(1)

def drawing():
    for x in range(3):
        print('正在画图%s'%threading.current_thread())
        time.sleep(1)

def main():
    t1=threading.Thread(target=coding)
    t2=threading.Thread(target=drawing)

    t1.start()
    t2.start()
    print(threading.enumerate())

if __name__ == '__main__':
    main()



#encoding:utf-8
import threading
import time
class CodingThread(threading.Thread):
    def run(self):
        for x in range(3):
            print('正在写代码%s'%threading.current_thread())
            time.sleep(1)

class DrawingThread(threading.Thread):
    def run(self):
        for x in range(3):
            print('正在画图%s' % threading.current_thread())
            time.sleep(1)

def main():
    t1=CodingThread()
    t2=DrawingThread()

    t1.start()
    t2.start()

if __name__ == '__main__':
    main()
#encoding:utf-8
import threading

VALUE=0
gLock=threading.Lock()

def add_value():
    global VALUE
    gLock.acquire()
    for x in range(1000000):
        VALUE+=1
    gLock.release()
    print('value: %d'%VALUE)

def main():
    for x in range(2):
        t=threading.Thread(target=add_value)
        t.start()

if __name__ == '__main__':
    main()
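
For contrast, here is a minimal sketch of the same counter without the lock. Because VALUE += 1 is a separate read, add and write, the interpreter can switch threads in between, so the final total can come out below 2000000 (whether it actually does depends on the Python version and timing).

#encoding:utf-8
import threading

VALUE=0

def add_value_unsafe():
    global VALUE
    # no lock here: the two threads may interleave between the read and the write
    for x in range(1000000):
        VALUE+=1

def main():
    threads=[threading.Thread(target=add_value_unsafe) for _ in range(2)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    # may print less than 2000000 when increments are lost
    print('value: %d'%VALUE)

if __name__ == '__main__':
    main()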


import threading
import random
import time

gMoney=1000
gCondition=threading.Condition()
gTotalTimes=10
gTimes=0


class Producer(threading.Thread):
    def run(self):
        global gMoney
        global gTimes
        while True:
            money=random.randint(100,1000)
            gCondition.acquire()
            # stop after gTotalTimes rounds of production in total
            if gTimes>=gTotalTimes:
                gCondition.notify_all()
                gCondition.release()
                break
            gMoney+=money
            print('%s生产了%d元钱,剩余%d元钱'%(threading.current_thread(),money,gMoney))
            gTimes+=1
            # wake up consumers waiting for money
            gCondition.notify_all()
            gCondition.release()
            time.sleep(0.5)

class Consumer(threading.Thread):
    def run(self):
        global gMoney
        while True:
            money=random.randint(100,1000)
            gCondition.acquire()
            # wait until there is enough money, unless production has already finished
            while gMoney<money:
                if gTimes>=gTotalTimes:
                    gCondition.release()
                    return
                print('%s准备消费%d元钱,剩余%d元钱,不足!'%(threading.current_thread(),money,gMoney))
                gCondition.wait()
            gMoney-=money
            print('%s消费了%d元钱,剩余%d元钱'%(threading.current_thread(),money,gMoney))
            gCondition.release()
            time.sleep(0.5)

def main():
    for x in range(5):
        t=Producer(name="生产者线程%d"%x)
        t.start()

    for x in range(3):
        t=Consumer(name='消费者线程%d'%x)
        t.start()

if __name__ == '__main__':
    main()


#encoding:utf-8

from queue import Queue
import threading
import time


def set_value(q):
    index=0
    while True:
        # put blocks once the queue already holds 4 items
        q.put(index)
        index+=1
        time.sleep(3)

def get_value(q):
    while True:
        # get blocks until an item is available
        print(q.get())

def main():
    q=Queue(4)
    t1=threading.Thread(target=set_value,args=[q])
    t2=threading.Thread(target=get_value,args=[q])

    t1.start()
    t2.start()

if __name__ == '__main__':
    main()
#encoding:utf-8

from selenium import webdriver
import time


driver_path=r"C:\Users\ASUS\AppData\Local\Temp\geckodriver-v0.24.0-win64-3"
driver=webdriver.Firefox(executable_path=driver_path)
driver.get('https://www.baidu.com/')
print(driver.page_source)


for cookie in driver.get_cookies():
    print(cookie)
print('='*30)
print(driver.get_cookie('PSTM'))
# #encoding:utf-8
#
# from selenium import webdriver
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.common.by import By
#
# driver_path=r"D:\ProgramApp\chromedriver\chromedriver.exe"
# driver=webdriver.Chrome(executable_path=driver_path)
# driver.get('http://www.douban.com/')
#
# # implicit wait: poll up to 20 seconds for elements to appear
# driver.implicitly_wait(20)
#
# # explicit wait: block up to 10 seconds until the element is present
# WebDriverWait(driver,10).until(
#     EC.presence_of_element_located((By.ID,'sfasfsd'))
# )

#encoding:utf-8
from selenium import webdriver

driver_path=r"D:\ProgramApp\chromedriver\chromedriver.exe"
options=webdriver.ChromeOptions()
# route all browser traffic through an HTTP proxy
options.add_argument("--proxy-server=http://49.85.14.35:28149")
driver=webdriver.Chrome(executable_path=driver_path,chrome_options=options)

driver.get('https://www.baidu.com/')

submitBtn=driver.find_element_by_id('su')
print(type(submitBtn))
print(submitBtn.get_attribute("value"))
driver.save_screenshot('baidu.png')
regular expressions
#encoding:utf-8
import re
# 1. Match a literal substring
# text="ahello"
# ret=re.search('he',text)
# print(ret.group())

# 2. . (dot): matches any single character except a newline,
#    so matching '.' against "\n" returns None
# text="\n"
# ret=re.match('.',text)
# print(ret.group())


# 3. \d: matches any digit (0-9)
# text="a"
# ret=re.match('\d',text)
# print(ret.group())

# 4. \D: matches any non-digit character
# text="2"
# ret=re.match('\D',text)
# print(ret.group())

# 5. \s: matches whitespace characters (\n, \t, \r, space)

# text="\r"
# ret=re.match('\s',text)
# print(ret.group())


# 6. \w: matches a-z, A-Z, digits and the underscore
# text="+"
# ret=re.match('\w',text)
# print(ret.group())

# 7. \W: the opposite of \w
# text="a"
# ret=re.match('\W',text)
# print(ret.group())


# 8. []: a character class matches any single character listed inside the brackets
# text="09"
# ret=re.match('[0-9]',text)
# print(ret.group())

# 8.2. the bracket form [a-zA-Z0-9_] is equivalent to \w
# text="_"
# ret=re.match('[a-zA-Z0-9_]',text)
# print(ret.group())

# 8.4. the bracket form [^a-zA-Z0-9_] is equivalent to \W
# text="_"
# ret=re.match('[^a-zA-Z0-9_]',text)
# print(ret.group())


##### Matching multiple characters #####
# 9. *: matches zero or more of the preceding token
# text="abcd"
# ret=re.match('\s*',text)
# print(ret.group())

# 10. +: matches one or more of the preceding token
# text="+abcd"
# ret=re.match('\w+',text)
# print(ret.group())

# 11. ?: matches zero or one of the preceding token
# text="abcd"
# ret=re.match('\w?',text)
# print(ret.group())

# 12. {m}: matches exactly m repetitions
# text="abcd"
# ret=re.match('\w{2}',text)
# print(ret.group())

# 13. {m,n}: matches from m to n repetitions
# text="abcda"
# ret=re.match('\w{1,5}',text)
# print(ret.group())

# 14. Validate a mobile phone number:

# text="18578900987"
# ret=re.match('1[34578]\d{9}',text)
# print(ret.group())

# 15. Validate an email address
# text="hynever12_@qq.com"
# ret=re.match('\w+@[a-z0-9]+\.[a-z]+',text)
# print(ret.group())


# 16. Validate a URL
# text="https://baike.baidu.com/item/Python/407313?fr=aladdin"
# ret=re.match('(http|https|ftp)://[^\s]+',text)
# print(ret.group())

# 17. Validate an ID card number
# text="3113111890812323X"
#
# ret=re.match('\d{17}[\dxX]',text)
# print(ret.group())

# 18. ^ (caret): matches at the start of the string
# text="hello"
# ret=re.search('^h',text)
# print(ret.group())

# 19. $: matches at the end of the string
# text="xxx@163.com"
# ret=re.match('\w+@163\.com$',text)
# print(ret.group())

# 20. |: alternation between several patterns
# text="httpsdfdas"
# ret=re.match('(ftp|http|https)$',text)
# print(ret.group())

# 21. Greedy vs non-greedy matching:
# text="0123456"
# ret=re.match('\d+?',text)
# print(ret.group())


# text="<h1>标题<h1>"
# ret=re.match('<.+?>',text)
# print(ret.group())

# 22. Match an integer between 0 and 100
# should match: 1, 2, 3, 10, 99, 100
# should not match: 09, 101
text="01"
# "01" does not match, so re.match returns None and .group() would raise here
ret=re.match('[1-9]\d?$|100$',text)
print(ret.group())
#encoding:utf-8
import re

# text="apple price is $299"
# ret=re.search("\$\d+",text)
# print(ret.group())
# r = raw string: backslash escapes are not processed in the literal
# text=r'\n'
# print(text)

# Matching the literal two characters "\c" in the text:
text="\c"
# In Python source, '\\n' means the two characters \ and n, and '\\\\n' means \\n.
# In a regex, a literal backslash has to be written as \\, which as a plain
# Python string is '\\\\'; with a raw string, r'\\c' matches the text \c.
# ret=re.match(r'\\c',text)
# print(ret.group())

# Groups
# text="apple's price $99,orange's price is $10"
# ret=re.search('.*(\$\d+).*(\$\d+)',text)
# print(ret.group(0))
# # ret.group(0) is the same as ret.group()
# print(ret.group(1))
# print(ret.group(2))
# print(ret.group(1,2))
# # groups() returns all the subgroups at once
# print(ret.groups())

# findall: return every match as a list
# text="apple's price $99,orange's price is $10"
# ret=re.findall('\$\d+',text)
# print(ret)
html="""
<dd class="job_bt">
        <h3 class="description">职位描述:</h3>
        <div class="job-detail">
        <p>职位描述</p>
<p>工作职责(包括但不限于):</p>
<p>1、负责全网视频数据的抓取、去重、识别等工作;</p>
<p>2、负责核心数据抓取及存储系统的架构设计、优化;</p>
<p><br></p>
<p>任职资格:</p>
<p>1.本科及以上学历,2年左右相关工作经验, 有扎实的数据结构和算法功底,</p>
<p>2.熟悉linux开发环境,熟练掌握Python编程语言;</p>
<p>3.有爬虫,信息抽取,文本分类相关经验者优先 ;</p>
<p>4.工作认真细致踏实,较强的学习能力、分析解决问题能力</p>
        </div>
    </dd>
"""
# sub: replace every match (here: strip the HTML tags from the job posting)
# ret=re.sub('<.+?>',"",html)
# print(ret)

# split: split a string on a regex
# text="hello world ni hao"
# ret=re.split('[^a-zA-Z]+',text)
# print(ret)

text="the number is 20.50"
# r=re.compile('\d+\.?\d*')
r=re.compile(r"""
  \d+   # digits before the decimal point
  \.?   # the decimal point itself
  \d*   # digits after the decimal point
""",re.VERBOSE)
ret=re.search(r,text)
print(ret.group())
#encoding:utf-8

import requests
import re

def parse_page(url):
    headers={
        'User-Agent':"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3676.400 QQBrowser/10.4.3505.400"
    }
    response=requests.get(url,headers=headers)
    text=response.text
    # re.DOTALL lets . match newlines too, so the patterns can span lines
    titles=re.findall(r'<div\sclass="cont">.*?<b>(.*?)</b>',text,re.DOTALL)
    dynasties=re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>',text,re.DOTALL)
    authors=re.findall(r'<p class="source">.*?<a.*?>.*?<a.*?>(.*?)</a>',text,re.DOTALL)
    content_tags=re.findall(r'<div class="contson".*?>(.*?)</div>',text,re.DOTALL)
    contents=[]
    for content in content_tags:
        # strip the tags left inside the poem body
        x=re.sub(r'<.*?>',"",content)
        contents.append(x.strip())

    poems=[]
    for value in zip(titles,dynasties,authors,contents):
        title,dynasty,author,content=value
        poem={
            'title':title,
            'dynasty':dynasty,
            'author':author,
            'content':content
        }
        poems.append(poem)

    for poem in poems:
        print(poem)
        print('='*40)

def main():
    for x in range(1,11):
        url='https://www.gushiwen.org/default_%s.aspx'%x
        parse_page(url)

if __name__ == '__main__':
    main()
# #encoding: utf-8
# from lxml import etree
# parser=etree.HTMLParser(encoding='utf-8')
# html=etree.parse("tencent.html",parser=parser)
#
#
# # xpath() always returns a list; //tr selects every tr element,
# # while //tr[2] takes only the second one
# trs=html.xpath("//tr[2]")[0]
# print(trs)



#encoding: utf-8
import requests

headers={
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3676.400 QQBrowser/10.4.3505.400',
    'Referer':"https://careers.tencent.com/citymain.html",
     'Accept':"image/webp,image/apng,image/*,*/*;q=0.8",
     'Accept-Encoding':"gzip, deflate, br",
     'Accept-Language': "zh-CN,zh;q=0.9",
     'Connection':" keep-alive",
     'Host': "cdn.multilingualres.hr.tencent.com",
     'If-Modified-Since': "Mon, 22 Apr 2019 20:20:23 GMT"
}

responce=requests.get("https://careers.tencent.com/search.html?pcid=40001",headers=headers)

with open('tencent.html','w',encoding='utf-8') as fp:
    fp.write(responce.content.decode('utf-8'))

print(responce.url)
spider
#encoding: utf-8

import requests
from bs4 import BeautifulSoup

def parse_page(url):
    headers={
        'User-Agent':"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3676.400 QQBrowser/10.4.3505.400"
    }
    response=requests.get(url,headers=headers)
    text=response.content.decode('utf-8')
    soup=BeautifulSoup(text,'html5lib')
    conMidtab = soup.find('div',class_='conMidtab')
    tables=conMidtab.find_all('table')
    for table in tables:
        trs=table.find_all('tr')[2:]
        for index,tr in enumerate(trs):
            tds=tr.find_all('td')
            city_td=tds[0]
            if index==0:
                city_td=tds[1]
            city=list(city_td.stripped_strings)[0]
            temp_td=tds[-2]
            min_temp=list(temp_td.stripped_strings)[0]
            print({'city':city,"min_temp":min_temp})

def main():
    url='http://www.weather.com.cn/textFC/gat.shtml'
    parse_page(url)

if __name__ == '__main__':
    main()

# #encoding: utf-8
#
# from bs4 import BeautifulSoup
#
# html="""
#
#
#
# """
# bs=BeautifulSoup(html,"lxml")
# print(bs.prettify())
#
# 1. Get all tr elements
# trs=soup.find_all('tr')
# for tr in trs:
#     print(type(tr))
#     break
#     from bs4.element import Tag
#
# 2. Get the second tr element
# tr=soup.find_all('tr',limit=2)[1]
# print(tr)
# 3. Get all tr elements whose class equals 'even'
#
# trs=soup.find_all('tr',attrs={'class':'even'})
# for tr in trs:
#     print(tr)
#     print('='*30)
#
#
# 4. Extract all a elements whose id and class both equal 'test'
# aList=soup.find_all('a',id='test',class_='test')
# for a in aList:
#     print(a)
#
#
# 5. Get the href attribute of every a element
# aList=soup.find_all('a')
# for a in aList:
#     href=a.attrs['href']
#     print(href)
#
# 6. Get all the listing information as plain text
# trs=soup.find_all('tr')[1:]
# movies=[]
# for tr in trs:
#     movie={ }
#     tds=tr.find_all("td")
#     title=tds[0]
#     print(title.string)
#     category=tds[1].string
#     nums=tds[2].string
#     city=tds[3].string
#     pubtime=tds[4].string
#     movie['title']=title
#     movie['category']=category
#     movie['nums']=nums
#     movie['city']=city
#     movie['pubtime']=pubtime
#     movies.append(movie)
#
#    infos=list(tr.stripped_strings)
#    movie['title']=infos[0]
#    movie['category']=infos[1]
#    movie['nums']=infos[2]
#    movie['city']=infos[3]
#    movie['pubtime']=infos[4]
#    movies.append(movie)
#
# print (movies)

a=soup.find('a')
print(a)

tr=soup.find_all('tr')[1]
text=list(tr.strings)
print(text)

#encoding: utf-8

from bs4 import BeautifulSoup

soup=BeautifulSoup(html,'lxml')

# 1. Get all tr elements
trs=soup.select("tr")
for tr in trs:
    print(type(tr))
    print("="*30)
    break

# 2. Get the second tr element
tr=soup.select('tr')[1]
print(tr)

# 3. Get all tr elements whose class equals 'even'
trs=soup.select("tr[class='even']")
for tr in trs:
    print(tr)

# 4. Get the href attribute of every a element
aList=soup.select('a')
for a in aList:
    href=a['href']
    print(href)

# 5. Get all the row information as plain text
trs=soup.select('tr')
for tr in trs:
    infos=list(tr.stripped_strings)
    print(infos)

#encoding: utf-8

from bs4 import BeautifulSoup

html="""
   """

from bs4.element import Tag
from bs4.element import NavigableString
from bs4.element import Comment
soup=BeautifulSoup(html,'lxml')
table=soup.find('table')
print(type(table.children))

import requests
from bs4 import BeautifulSoup
import html5lib
from pyecharts import Bar
ALL_DATA = []

def parse_page(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
    }
    response = requests.get(url, headers=headers)
    text = response.content.decode('utf-8')
    # the html5lib parser is needed here to repair the page's incomplete HTML tags
    soup = BeautifulSoup(text,'html5lib')
    conMidtab = soup.find('div',class_='conMidtab')
    tables = conMidtab.find_all('table')
    for table in tables:
        trs = table.find_all('tr')[2:]
        for index,tr in enumerate(trs):
            tds = tr.find_all('td')
            city_td = tds[0]
            if index == 0:
                city_td = tds[1]
            city = list(city_td.stripped_strings)[0]
            temp_td = tds[-2]
            temp = list(temp_td.stripped_strings)[0]
            # print({'city':city,'temp':int(temp)})
            ALL_DATA.append({'city':city,'temp':int(temp)})


def main():
    url_list = [
        'http://www.weather.com.cn/textFC/hb.shtml',
        'http://www.weather.com.cn/textFC/db.shtml',
        'http://www.weather.com.cn/textFC/hd.shtml',
        'http://www.weather.com.cn/textFC/hz.shtml',
        'http://www.weather.com.cn/textFC/hn.shtml',
        'http://www.weather.com.cn/textFC/xb.shtml',
        'http://www.weather.com.cn/textFC/xn.shtml',
        'http://www.weather.com.cn/textFC/gat.shtml',
    ]
    for url in url_list:
        parse_page(url)
    # sort by lowest temperature and keep only the first 10 entries
    ALL_DATA.sort(key=lambda data:data['temp'])
    data = ALL_DATA[0:10]
    # pull out the city names and the temperatures separately
    cities = list(map(lambda x:x['city'],data))
    temps = list(map(lambda x:x['temp'],data))

    chart = Bar("中国天气最低气温排行榜")
    chart.add('',cities,temps)
    chart.render('temperature.html')

if __name__ == '__main__':
    main()
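
The Bar import above is the old pyecharts 0.x interface. On pyecharts 1.x and later the chart-building API changed, so the rendering step would look roughly like this instead (a minimal sketch, assuming pyecharts >= 1.0):

# assumes pyecharts >= 1.0, where chart classes live in pyecharts.charts
from pyecharts.charts import Bar
from pyecharts import options as opts

def render_chart(cities, temps):
    # build the same lowest-temperature bar chart with the 1.x fluent API
    chart = (
        Bar()
        .add_xaxis(cities)
        .add_yaxis("", temps)
        .set_global_opts(title_opts=opts.TitleOpts(title="中国天气最低气温排行榜"))
    )
    chart.render('temperature.html')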
#encoding: utf-8
# 1. Fetch the page from the target site
import requests
from lxml import etree
headers={
    'User-Agent':"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3676.400 QQBrowser/10.4.3505.400",
    'Referer':'https://movie.douban.com/'
}
# 2. Extract the data from the fetched page according to fixed rules

url='https://movie.douban.com/cinema/nowplaying/changsha/'
response=requests.get(url,headers=headers)
text=response.text

html=etree.HTML(text)
ul=html.xpath("//ul[@class='lists']")[0]
lis=ul.xpath("./li")
movies=[]
for li in lis:
    title=li.xpath("@data-title")[0]
    score=li.xpath("@data-score")[0]
    duration=li.xpath("@data-duration")[0]
    region=li.xpath("@data-region")[0]
    director=li.xpath("@data-director")[0]
    actors=li.xpath("@data-actors")[0]
    thumbnail=li.xpath(".//img/@src")
    movie={
        'title':title,
        'score':score,
        'duration':duration,
        'region':region,
        'director':director,
        'actors':actors,
        'thumbnail':thumbnail
    }

    movies.append(movie)
    print(movies)
    # print(etree.tostring(ul,encoding='utf-8').decode("utf-8"))
#encoding: utf-8

from lxml import etree
import requests
BASE_DOMAIN='http://www.ygdy8.net'

url="http://www.ygdy8.net/html/gndy/dyzz/list_23_2.html"
headers={
    'User-Agent':"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25"
}

def get_detail_urls(url):
    response=requests.get(url,headers=headers)

    # the site serves GBK-encoded pages; decode the raw bytes as gbk
    text=response.content.decode('gbk')
    html=etree.HTML(text)
    detail_urls=html.xpath("//table[@class='tbspan']//a/@href")
    detail_urls=map(lambda url:BASE_DOMAIN+url,detail_urls)
    return detail_urls

def parse_detail_page(url):
    movie={}
    response=requests.get(url,headers=headers)
    text=response.content.decode('gbk')
    html=etree.HTML(text)
    title=html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
    movie['title']=title
    zoomE=html.xpath("//div[@id='Zoom']")[0]
    imgs=zoomE.xpath(".//img/@src")
    cover=imgs[0]
    screenshot=imgs[1]
    movie['cover']=cover
    movie['screenshot']=screenshot

    def parse_info(info,rule):
        return info.replace(rule,"").strip()

    # every field on the detail page is prefixed with a "◎" label
    infos=zoomE.xpath(".//text()")
    for index,info in enumerate(infos):
        if info.startswith("◎年　代"):
            movie['year']=parse_info(info,"◎年　代")
        elif info.startswith("◎产　地"):
            movie['country']=parse_info(info,"◎产　地")
        elif info.startswith("◎类　别"):
            movie['category']=parse_info(info,"◎类　别")
        elif info.startswith("◎豆瓣评分"):
            movie['douban_rating']=parse_info(info,"◎豆瓣评分")
        elif info.startswith("◎片　长"):
            movie['duration']=parse_info(info,"◎片　长")
        elif info.startswith("◎导　演"):
            movie['director']=parse_info(info,"◎导　演")
        elif info.startswith("◎主　演"):
            # the first actor is on this line; the rest follow one per line
            actors=[parse_info(info,"◎主　演")]
            for x in range(index+1,len(infos)):
                actor=infos[x].strip()
                if actor.startswith("◎"):
                    break
                actors.append(actor)
            movie['actors']=actors
        elif info.startswith("◎简　介"):
            # the synopsis is on the following line(s)
            for x in range(index+1,len(infos)):
                profile=infos[x].strip()
                if profile:
                    movie['profile']=profile
                    break
    download_url=html.xpath("//td[@bgcolor='#fdfddf']/a/@href")
    movie['download_url']=download_url
    return movie


def spider():
    base_url="http://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html"
    movies=[]
    for x in range(1,8):
        # the outer loop walks the 7 list pages
        url=base_url.format(x)
        detail_urls=get_detail_urls(url)
        for detail_url in detail_urls:
            # the inner loop visits every movie detail url on one list page
            movie=parse_detail_page(detail_url)
            movies.append(movie)
    print(movies)
if __name__ == '__main__':
    spider()
# #encoding: utf-8
# import requests
#
# response=requests.get("http://www.baidu.com/")
# print(type(response.content))
# print(response.content.decode('utf-8'))
#
#
# print(response.url)
# print(response.encoding)
# print(response.status_code)



#encoding: utf-8
import requests
params={
    'wd':'中国'
}
headers={
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3676.400 QQBrowser/10.4.3505.400'

}

responce=requests.get("https://www.baidu.com/s",params=params,headers=headers)

with open('baidu.html','w',encoding='utf-8') as fp:
    fp.write(responce.content.decode('utf-8'))

print(responce.url)
#encoding: utf-8

import requests

# route the request through an HTTP proxy
proxy={
    'http':'182.34.34.191:9999'
}
response=requests.get("http://httpbin.org/ip",proxies=proxy)

print(response.text)



#encoding: utf-8
from lxml import etree

parser=etree.HTMLParser(encoding='utf-8')
html=etree.parse("tencent.html",parser=parser)

# 1. Get all tr elements
trs=html.xpath("//tr")
for tr in trs:
    print(etree.tostring(tr,encoding='utf-8').decode('utf-8'))

# 2. Get the second tr element
tr=html.xpath("//tr[2]")[0]
print(tr)

# 3. Get all tr elements whose class equals 'even'
trs=html.xpath("//tr[@class='even']")
for tr in trs:
    print(tr)

# 4. Get the href attribute of every a element
aList=html.xpath("//a/@href")
for a in aList:
    print("http://hr.tencent.com/"+a)

# 5. Get all the job information (plain text)
trs=html.xpath("//tr[position()>1]")
positions=[]
for tr in trs:
    href=tr.xpath(".//a/@href")[0]
    fullurl='http://hr.tencent.com/'+href
    title=tr.xpath("./td[1]//text()")[0]
    category=tr.xpath("./td[2]/text()")[0]
    nums=tr.xpath("./td[3]/text()")[0]
    address=tr.xpath("./td[4]/text()")[0]
    pubtime=tr.xpath("./td[5]/text()")[0]

    position={
        'url':fullurl,
        'title':title,
        'category':category,
        'nums':nums,
        'address':address,
        'pubtime':pubtime
    }
    positions.append(position)

    break





#encoding: utf-8

import requests

url="http://www.renren.com/PLogin.do"
data={"email":"970138079@qq.com",'password':"pythonspider"}
headers={
       'User-Agent':"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3676.400 QQBrowser/10.4.3505.400"
}

session=requests.Session()
session.post(url,data=data,headers=headers)
response=session.get('http://www.renren.com/880151247/profile')
with open('renren.html','w',encoding='utf-8') as fp:
    fp.write(response.text)
#encoding: utf-8
from urllib import parse
from urllib import request
# usage of the urlretrieve function (download a URL straight to a local file)
#request.urlretrieve('https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1559904206&di=80bc57b1e2a8b53b00ab9adc411923e9&imgtype=jpg&er=1&src=http%3A%2F%2Fb-ssl.duitang.com%2Fuploads%2Fitem%2F201706%2F30%2F20170630005238_ztsxW.jpeg','luban.jpg')

# usage of the urlencode function
# params={'name':'张三',"age":18,'greet':'hello world'}
# result=parse.urlencode(params)
# print(result)

# build the query string with urlencode before opening the URL
# url='http://www.baidu.com/s'
# params={"wd":"刘德华"}
# qs=parse.urlencode(params)
# url=url+"?"+qs
# resp=request.urlopen(url)
# print(resp.read())

params={'name':'张三',"age":18,'greet':'hello world'}
qs=parse.urlencode(params)
print(qs)
# parse_qs decodes a query string back into a dict (values come back as lists)
result=parse.parse_qs(qs)
print(result)
