python爬取法院失信名单——数据分析师不懂黑科技

爬取涉及用户交互以及异步加载的页面时,需要模拟用户点击并等待页面响应,可以借助自动化测试工具selenium。
firefox浏览器驱动geckodriver下载地址

# chromedriver is shipped as a .zip archive; tar -z only handles gzip'd
# tarballs, so unpack it with unzip instead.
unzip chromedriver_linux64.zip
chmod +x chromedriver
sudo mv chromedriver /usr/bin/

pip install selenium

爬取法院失信名单(python)

# coding=utf-8    

import datetime
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
import sys
import os

# Open a Firefox browser (geckodriver must be installed and reachable on
# PATH beforehand), search Baidu for the court "dishonest debtor" widget and
# save the first page of results to shixin.txt.
#----------------------------------------------------

browser = webdriver.Firefox()
print("正在爬取...")
# The URL used to be written with a backslash after the scheme, which is not
# a well-formed URL; use the standard "//" separator.
browser.get("http://www.baidu.com/")

# The find_element_by_* helpers were removed in Selenium 4.3; the
# find_element(By.<strategy>, value) form works on old and new versions.
elem = browser.find_element(By.NAME, 'wd')
elem.send_keys("全国法院被执行人信息查询 - 被执行人查询")
browser.find_element(By.XPATH, '//*[@id="su"]').click()

# The result page is rendered after the click, so page_source may not yet
# contain the widget. Wait (up to 10 s) for the first name cell to exist
# before parsing; a TimeoutException here means the widget never showed up.
WebDriverWait(browser, 10).until(
    EC.presence_of_element_located((By.CLASS_NAME, "op_trust_name"))
)

print("正在爬取...")
soup = BeautifulSoup(browser.page_source, "lxml")
id_card = soup.find_all("span", attrs={"class": "op_trust_fl op_trust_papers"})
name = soup.find_all("span", attrs={"class": "op_trust_name"})
names = [x.get_text() for x in name]
id_cards = [x.get_text() for x in id_card]
# Build the frame directly: DataFrame.append (previously used to add these
# rows to an empty frame) was removed in pandas 2.0.
data = pd.DataFrame({'names': names, 'id_card': id_cards})
# The file is opened in append mode, so only emit the column header when the
# file is new or empty; otherwise every run would add another header row.
write_header = (
    not os.path.exists("shixin.txt") or os.path.getsize("shixin.txt") == 0
)
data.to_csv("shixin.txt", mode='a', index=False, header=write_header)
time.sleep(1)
def scrapef():
    """Go to the next result page and append its rows to ``shixin.txt``.

    Clicks the "next page" control of the result widget in the module-level
    ``browser``, pauses 1.5 seconds so the new page can render, parses the
    name and ID-card spans out of the page source, appends them to
    ``shixin.txt`` and prints the scraped rows.

    Raises:
        selenium.common.exceptions.NoSuchElementException: if the next-page
            control cannot be found in the current page.
    """
    # Next page. find_element_by_xpath was removed in Selenium 4.3;
    # find_element(By.XPATH, ...) works on old and new versions alike.
    browser.find_element(
        By.XPATH, '//p/span[@class="op_trust_page_next OP_LOG_BTN"]'
    ).click()
    time.sleep(1.5)
    print("正在爬取...")
    soup = BeautifulSoup(browser.page_source, "lxml")
    id_card = soup.find_all("span", attrs={"class": "op_trust_fl op_trust_papers"})
    name = soup.find_all("span", attrs={"class": "op_trust_name"})
    names = [x.get_text() for x in name]
    id_cards = [x.get_text() for x in id_card]
    data = pd.DataFrame({'names': names, 'id_card': id_cards})
    # The first page has already written the column header; repeating it for
    # every appended page would scatter header rows through the data.
    data.to_csv("shixin.txt", mode='a', index=False, header=False)
    print(data)

# Keep paging until the next-page control can no longer be found, then stop
# cleanly. The browser is always closed, even on Ctrl-C or an unexpected
# error, so no driver process is left running.
from selenium.common.exceptions import NoSuchElementException

try:
    while True:
        scrapef()
except NoSuchElementException:
    # scrapef() raises this when there is no next-page control left to click.
    print("爬取结束")
finally:
    browser.quit()

如果使用R,由于没有像java或python那样自带绑定的版本,需要单独下载selenium服务端软件并启动selenium服务,同时还要下载chrome或firefox驱动。

# Package names are case-sensitive: the package is "RSelenium".
install.packages('RSelenium')

你可能感兴趣的:(python爬取法院失信名单——数据分析师不懂黑科技)