python+selenium爬取链家网房源信息并保存至csv
python+selenium爬取链家网房源信息并保存至csv
抓取的信息有:‘房源’、‘详细信息’、‘价格’、‘楼层’、‘有无电梯’。
import csv
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
def write2txt(line):
    """Append *line*, followed by a newline, to ``租房.txt``.

    The file is opened in append mode with UTF-8 encoding on every call,
    so it is created on first use and never truncated.
    """
    with open('租房.txt', mode='a', encoding='utf-8') as txt_file:
        txt_file.write(''.join((line, '\n')))
def write_to_csv(row_data):
    """Append *row_data* as a single record to ``data.csv``.

    The file is opened with ``newline=""`` so the csv module controls the
    line endings itself, and with UTF-8 encoding.
    """
    with open('data.csv', mode='a+', newline='', encoding='utf-8') as csv_file:
        row_writer = csv.writer(csv_file)
        row_writer.writerow(row_data)
def _create_browser(driver_path):
    """Start Chrome with the chromedriver executable at *driver_path*.

    Newer Selenium releases take the driver location through a ``Service``
    object and reject the ``executable_path`` keyword; older releases only
    know ``executable_path``. Try the new form first and fall back.
    """
    try:
        from selenium.webdriver.chrome.service import Service
        return webdriver.Chrome(service=Service(driver_path))
    except TypeError:
        # ``service=`` is not a known keyword on this Selenium version.
        return webdriver.Chrome(executable_path=driver_path)


def _read_floor_and_elevator(browser):
    """Return ``(floor, elevator)`` texts from the focused detail page.

    The values are the 8th and 9th ``<li>`` under
    ``div.content__article__info > ul``. When the page has fewer entries
    than that, a pair of empty strings is returned instead of raising.
    """
    li_list = browser.find_elements(
        By.CSS_SELECTOR, 'div.content__article__info > ul > li')
    if len(li_list) < 9:
        return '', ''
    return li_list[7].text, li_list[8].text


def process():
    """Scrape 13 result pages of Lianjia rentals into ``data.csv`` / ``租房.txt``.

    For each listing found on a result page the title, description and
    price are read from the list page; the listing is then opened (it is
    expected to open in a new tab) to read the floor and elevator fields.
    Each listing is written through ``write2txt`` and ``write_to_csv``.

    The browser is always shut down when the function exits, including on
    error. A ``TimeoutException`` from the 3-second wait propagates if a
    result page never shows ``div.content__list``.
    """
    driver_path = r"D:\chromedriver.exe"
    browser = _create_browser(driver_path)
    try:
        browser.implicitly_wait(1)
        browser.maximize_window()
        wait = WebDriverWait(browser, 3)
        write_to_csv(['房源', '详细信息', '价格','楼层', '有无电梯'])
        for page in range(1, 14):
            if page == 1:
                url = 'https://sh.lianjia.com/zufang/rs%E6%9D%BE%E6%B1%9F%E5%A4%A7%E5%AD%A6%E5%9F%8E/#contentList'
            else:
                url = 'https://sh.lianjia.com/zufang/pg' + str(page) +'rs松江大学城/#contentList'
            browser.get(url)
            container = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'div.content__list')))
            div_list = container.find_elements(By.TAG_NAME, 'div')
            print(len(div_list))
            list_page_handle = browser.current_window_handle
            # The tag-name lookup returns every descendant <div>, so a
            # wrapper and a div nested inside it can both lead to the same
            # listing; remember the detail links already handled.
            seen_links = set()
            for n, div in enumerate(div_list):
                detail_p_list = div.find_elements(By.CSS_SELECTOR, 'p')
                if len(detail_p_list) < 2:
                    # Not a listing card (no title/description paragraphs).
                    continue
                title_links = detail_p_list[0].find_elements(By.TAG_NAME, 'a')
                if not title_links:
                    continue
                title_a = title_links[0]
                title = title_a.text
                link_key = title_a.get_attribute('href') or title
                if link_key in seen_links:
                    continue
                seen_links.add(link_key)
                print(n+1)
                # title
                print('房源:',title)
                # description: first three link texts plus the paragraph text
                a_list = detail_p_list[1].find_elements(By.TAG_NAME, 'a')
                detail_text = ''.join(a.text for a in a_list[:3])
                detail_text += detail_p_list[1].text
                print('详细信息:',detail_text)
                # price
                price_ems = div.find_elements(By.CSS_SELECTOR, 'span > em')
                price_span = price_ems[0].text if price_ems else ''
                print('价格:',price_span)
                # scroll down so the title link is in view before clicking
                browser.execute_script(
                    'window.scrollTo(0, arguments[0])', (n+1) * 1000)
                title_a.click()
                # give the detail tab a moment to open
                time.sleep(1)
                detail_handles = [handle for handle in browser.window_handles
                                  if handle != list_page_handle]
                if not detail_handles:
                    # No new tab appeared; closing the current window here
                    # would close the list page itself.
                    print('未打开详情页,跳过:', title)
                    continue
                browser.switch_to.window(detail_handles[-1])
                try:
                    # floor and elevator information
                    louceng, dianti = _read_floor_and_elevator(browser)
                finally:
                    # Always return to the list page, even if reading failed.
                    browser.close()
                    browser.switch_to.window(list_page_handle)
                print(louceng + dianti)
                raw_data = [title, detail_text, price_span, louceng, dianti]
                write2txt(','.join(raw_data))
                write_to_csv(raw_data)
    finally:
        browser.quit()
if __name__ == '__main__':
    # Run the scraper and report the wall-clock duration in seconds.
    started_at = time.time()
    process()
    elapsed = time.time() - started_at
    print('用时:' + str(elapsed))
欢迎关注我的微信公众号~