# 目的:获取数据源
# 实现方式:通过selenium模拟浏览器操作,获取指定网页上的需求数据
# %pip install selenium -i https://pypi.tuna.tsinghua.edu.cn/simple
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service # 新增
from selenium.webdriver.common.by import By # 新增
from selenium.webdriver.common.keys import Keys
from lxml import etree
import csv
import pandas as pd
def get_onePage_info(driver):
    """Extract all bid announcements from the currently displayed page.

    Scrolls to the bottom first so lazily-rendered entries appear, then
    parses the page source with lxml.

    :param driver: a selenium WebDriver positioned on a listing page
    :return: list of [title, url, region, date] rows, one per announcement
    """
    # Force-render the full list, then give the page a moment to settle.
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    time.sleep(2)

    # Build an lxml tree from the rendered HTML.
    html_root = etree.HTML(driver.page_source)

    # Every <li> under ul.cha_li is one announcement entry.
    records = []
    for item in html_root.xpath("//ul[@class='cha_li']/li"):
        title = ''.join(item.xpath('.//a/text()'))                     # announcement title
        link = ''.join(item.xpath('.//a/@href'))                       # announcement URL
        region = ''.join(item.xpath(".//span[@class='ua-2']/text()"))  # region
        pub_date = ''.join(item.xpath(".//span[@class='ua-3']/text()"))  # publish date
        records.append([title, link, region, pub_date])
    return records
def main():
    """Scrape 10 pages of 'LED大屏' bid announcements and save them to CSV.

    Drives a local Chrome instance with Selenium, follows the 'LED大屏'
    category link (which opens a new window), pages through the listing,
    and writes title/URL/region/date rows to 'LED大屏中标信息.csv'.
    """
    # NOTE(review): chromedriver path is machine-specific — adjust as needed.
    service = Service(executable_path=r"D:\DataScience\Pro_Review\get data\chromedriver.exe")
    driver = webdriver.Chrome(service=service)
    try:
        driver.get('http://www.dav01.com/project/bid/list.html')
        driver.maximize_window()
        # The category link opens a new window/tab; switch to the newest handle.
        driver.find_element(By.LINK_TEXT, 'LED大屏').click()
        windows = driver.window_handles
        driver.switch_to.window(windows[-1])
        time.sleep(2)

        pages = 10
        all_wins_infos = []
        for i in range(pages):
            all_wins_infos += get_onePage_info(driver)
            print('爬取第' + str(i+1) + '页成功')
            # Don't click "next" after the final page we intend to scrape.
            if i < pages - 1:
                driver.find_element(By.CLASS_NAME, 'pagf').click()  # next page
                time.sleep(2)
    finally:
        # Always release the browser, even if scraping fails midway.
        driver.quit()

    # newline='' stops the csv module from emitting blank rows on Windows.
    with open('LED大屏中标信息.csv', 'w', encoding='utf-8', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerow(['中标公告', '公告地址', '区域', '公示日期'])
        writer.writerows(all_wins_infos)
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
# Sample console output from a successful run:
# 爬取第1页成功
# 爬取第2页成功
# 爬取第3页成功
# 爬取第4页成功
# 爬取第5页成功
# 爬取第6页成功
# 爬取第7页成功
# 爬取第8页成功
# 爬取第9页成功
# 爬取第10页成功