import requests,random,os,xlwt,math,time,re,pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
#下载动态界面
def get_dynamic_html2(site_url):
    """Load ``site_url`` in Chrome, scroll through it and return the parsed DOM.

    A new Chrome instance is started for every call. After the page has been
    requested, ``fullpage_screenshot(driver, 8000)`` is called to scroll the
    window, and the resulting ``driver.page_source`` is parsed with
    BeautifulSoup's ``'html.parser'``.

    Args:
        site_url: URL passed to ``driver.get``.

    Returns:
        A ``BeautifulSoup`` object built from the page source.

    Raises:
        Any exception raised while configuring the driver, scrolling or
        reading the page source propagates to the caller; the browser is
        shut down first in every case.
    """
    print('开始加载',site_url,'动态页面')

    chrome_options = webdriver.ChromeOptions()
    # '--headless' is not passed, so a visible browser window is opened.
    for flag in (
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--ignore-ssl-errors',
    ):
        chrome_options.add_argument(flag)

    # NOTE(review): CHROME_DRIVER_PATH must be provided at module level; it is
    # not defined in the part of the file visible here -- confirm.
    # NOTE(review): `executable_path` / `chrome_options` are the legacy
    # Selenium keyword names; verify against the installed Selenium version.
    driver = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH,chrome_options=chrome_options)
    try:
        driver.set_page_load_timeout(100)
        driver.set_window_size(1920, 1080)
        try:
            driver.get(site_url)
        except Exception as e:
            # On a load failure (typically the page-load timeout) stop loading
            # and continue with whatever part of the page is available.
            driver.execute_script('window.stop()')
            print(e, 'dynamic web load timeout')
        time.sleep(2)
        # Scroll down the page before reading the DOM.
        fullpage_screenshot(driver, 8000)
        return BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # Always release the browser, even when scrolling or parsing failed;
        # otherwise every failed call leaves a Chrome process behind.
        try:
            time.sleep(3)  # pause before shutdown kept from the original flow
            driver.quit()
        except Exception as e:
            # Shutdown problems must not mask the result or the real error.
            print(e, 'failed to quit chrome driver')
#滚动
def fullpage_screenshot(driver, total_height, scroll_pause=0.5):
    """Scroll the browser window tile by tile over a ``total_height``-tall area.

    The area ``document.body.offsetWidth`` x ``total_height`` is split into
    viewport-sized rectangles, and the window is scrolled to the top-left
    corner of each rectangle after the first. No image is captured or
    written: the function only issues ``window.scrollTo`` calls and prints
    progress messages.

    Args:
        driver: Object exposing ``execute_script(script)`` (e.g. a Selenium
            WebDriver).
        total_height: Vertical extent to scroll through, in the same units
            as ``window.innerHeight``.
        scroll_pause: Seconds to sleep after each scroll. Defaults to 0.5.

    Returns:
        ``True`` once every rectangle has been visited, ``False`` if the
        page reports a missing or non-positive viewport size (nothing is
        scrolled in that case).
    """
    total_width = driver.execute_script("return document.body.offsetWidth")
    viewport_width = driver.execute_script("return document.body.clientWidth")
    viewport_height = driver.execute_script("return window.innerHeight")

    # A zero (or missing) viewport dimension would make the tiling loops
    # below step by 0 and never terminate.
    if (not viewport_width or not viewport_height
            or viewport_width < 0 or viewport_height < 0):
        print("Viewport size unavailable, skipping scroll")
        return False

    # Tile the area row by row: (left, top, right, bottom), clipped to the
    # total width/height.
    rectangles = []
    top = 0
    while top < total_height:
        bottom = min(top + viewport_height, total_height)
        left = 0
        while left < total_width:
            right = min(left + viewport_width, total_width)
            rectangles.append((left, top, right, bottom))
            left += viewport_width
        top += viewport_height

    for part, (left, top, _right, _bottom) in enumerate(rectangles):
        # The first rectangle is skipped for scrolling, as in the original
        # flow (presumably because the page starts at the origin).
        if part > 0:
            driver.execute_script("window.scrollTo({0}, {1})".format(left, top))
            print("Scrolled To ({0},{1})".format(left, top))
            time.sleep(scroll_pause)

        file_name = "part_{0}.png".format(part)
        print("Capturing {0} ...".format(file_name))

        # For a final partial row the offset is aligned to the bottom edge.
        if top + viewport_height > total_height:
            offset = (left, total_height - viewport_height)
        else:
            offset = (left, top)
        print("Adding to stitched image with offset ({0}, {1})".format(offset[0], offset[1]))

    print("Finishing chrome full page screenshot workaround...")
    return True
if __name__ == '__main__':
    # Fetch one 1688 search-result page and print, for every offer card, the
    # image alt text, the detail-page URL, the shop name and the shop link.
    soup_0 = get_dynamic_html2('https://s.1688.com/selloffer/offer_search.htm?keywords=%C5%B7%C3%C0%C5%AE%D7%B0&n=y&netType=1%2C11%2C16')
    info_tag_list = soup_0.select('.sm-offer-item')
    for info_tag in info_tag_list:
        a_tag_list = info_tag.select('a')
        try:
            # The offer id is read from the card's 'trace-obj_value' attribute.
            skc = info_tag.attrs['trace-obj_value']
            # First anchor holds the product image, third anchor the shop link.
            img_tag = a_tag_list[0].select('img')[0]
            shop_tag = a_tag_list[2]
            desc = img_tag.attrs['alt']
            shop_url = shop_tag.attrs['href']
        except (KeyError, IndexError) as e:
            # A card without the expected anchors/attributes must not abort
            # the whole listing; report it and move on.
            print('skip malformed offer item:', repr(e))
            continue
        url = 'https://detail.1688.com/offer/{0}.html'.format(skc)
        shop_name = shop_tag.text
        print(desc,url,shop_name,shop_url)
    # Number of offer cards matched by the selector (including skipped ones).
    print(len(info_tag_list))