Implementation steps

1. Extract the latest administrative division codes from the MCA (Ministry of Civil Affairs) data site

# Characteristics of the page

1. The newest bulletin is always listed at the top

# Code implementation

import requests

from lxml import etree

import re

 

url = 'http://www.mca.gov.cn/article/sj/xzqh/2019/'

headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}

html = requests.get(url, headers=headers).text

parse_html = etree.HTML(html)

article_list = parse_html.xpath('//a[@class="artitlelist"]')

 

for article in article_list:

    title = article.xpath('./@title')[0]

    # Regex-match links whose title contains the keyword "区划代码"

    if re.findall(r'.*?区划代码.*?', title, re.S):

        # Stop at the first match; the first entry is always the newest link

        two_link = 'http://www.mca.gov.cn' + article.xpath('./@href')[0]

        print(two_link)

        break

2. Extract the real link from the second-level page link (anti-scraping measure: the response embeds JS that redirects to a new URL)

1. Request the second-level link, get the response, and inspect the embedded JS code

2. Use a regex to extract the real second-level page URL

# Sketch of the approach

two_html = requests.get(two_link, headers=headers).text

# Extract the real URL from the second-level response (the address the embedded JS redirects to)

new_two_link = re.findall(r'window\.location\.href="(.*?)"', two_html, re.S)[0]
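For reference, the second-level response is just a small HTML page whose script assigns window.location.href; the snippet below is a hypothetical illustration (the URL is a placeholder) of that structure and of how the regex pulls the redirect target out of it:

import re

# Hypothetical response body: the real page differs, but the JS redirect has this general shape
two_html = '''<html><head><script type="text/javascript">
window.location.href="http://www.mca.gov.cn/article/sj/xzqh/2019/placeholder-real-page.html";
</script></head></html>'''

real_link = re.findall(r'window\.location\.href="(.*?)"', two_html, re.S)[0]
print(real_link)  # prints the real second-level URL embedded in the JS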

3. Check the database table to see whether this link has already been crawled, building an incremental crawler

1. Create a version table in the database to store the crawled links (see the sketch after this list)

2. Each time the program runs, check the new link against the records in the version table to see whether it has already been crawled
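Before that check can run, the version table must exist. A minimal sketch of creating it with pymysql, assuming a single link column that matches the insert into version values(%s) statement used later (the column name link is illustrative):

import pymysql

# Assumed single-column schema holding the crawled links; adjust name/length as needed
db = pymysql.connect(host='127.0.0.1', user='root', password='123456', database='govdb', charset='utf8')
cursor = db.cursor()
cursor.execute('create table if not exists version (link varchar(1000)) default charset=utf8')
db.commit()
cursor.close()
db.close()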

# Sketch of the approach

cursor.execute('select * from version')

result = cursor.fetchall()

# Crawl only when the table is empty or the stored link differs from the new one

if result and result[-1][0] == two_link:

    print('Already the latest version')

else:

    # There is an update: start crawling,

    # then insert the new link into the version table as the latest record

Full code implementation

'''Site data scraping (incremental spider)'''

import requests

from lxml import etree

import re

import pymysql

 

class Govement(object):

    def __init__(self):

        self.one_url = 'http://www.mca.gov.cn/article/sj/xzqh/2019/'

        self.headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}

        self.db = pymysql.connect(host='127.0.0.1', user='root', password='123456', database='govdb', charset='utf8')

        self.cursor = self.db.cursor()

 

    # Get the real second-level link and compare it with the database record

    def get_two_link(self):

        html = requests.get(self.one_url,headers=self.headers).text

        # The real second-level URL is hidden here; it is generated by a JS script (save the response to a local file to inspect it)

        parse_html = etree.HTML(html)

        a_list = parse_html.xpath('//a[@class="artitlelist"]')

        for a in a_list:

            title = a.xpath('./@title')[0]

            # Regex-match links whose title contains the keyword "区划代码"

            # (the first matching entry is normally the newest bulletin)

            if re.findall(r'.*?区划代码.*?', title, re.S):

                # Stop at the first match; the first entry is always the newest link

                two_link = 'http://www.mca.gov.cn' + a.xpath('./@href')[0]

                break

 

        # Extract the real second-level URL from the two_link just obtained

        two_html = requests.get(two_link, headers=self.headers).text

        # Extract the real URL from the second-level response (the address the embedded JS redirects to)

        real_two_link = re.findall(r'window\.location\.href="(.*?)"', two_html, re.S)[0]

        # Incremental crawling: compare against the version table

        self.cursor.execute('select * from version')

        result = self.cursor.fetchall()

        if result and result[-1][0] == real_two_link:

            print('Already the latest version')

        else:

            # New bulletin (or empty table): crawl it, then record the link

            self.get_data(real_two_link)

            self.cursor.execute('insert into version values(%s)', [real_two_link])

            self.db.commit()

 

    # Extract the data directly with XPath

    def get_data(self,real_two_link):

        real_two_html = requests.get(real_two_link,headers=self.headers).text

        parse_html = etree.HTML(real_two_html)

        # Base XPath: get the list of <tr> nodes, one per record

        tr_list = parse_html.xpath('//tr[@height="19"]')

        city_info = {}

        for tr in tr_list:

            # xpath() returns a list of text nodes; keep only rows that have both fields

            code = tr.xpath('./td[2]/text()')

            name = tr.xpath('./td[3]/text()')

            if code and name:

                city_info['code'] = code[0].strip()

                city_info['name'] = name[0].strip()

                print(city_info)

 

if __name__ == '__main__':

    spider = Govement()

    spider.get_two_link()

# coding=utf-8  
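# Standalone example: scrape QQ Zone "shuoshuo" posts with Selenium and the PhantomJS headless browser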

from bs4 import BeautifulSoup  

from selenium import webdriver  

import time  

  

# Use Selenium with the PhantomJS headless browser

driver = webdriver.PhantomJS(executable_path="D:\\python\\phantomjs-2.1.1\\bin\\phantomjs.exe")  

driver.maximize_window()  

  

# Log in to QQ Zone

def get_shuoshuo(qq):  

    driver.get('http://user.qzone.qq.com/{}/311'.format(qq))  

    time.sleep(5)  

    try:  

        driver.find_element_by_id('login_div')  

        a = True  

        print("需要登录...")  

    except:  

        a = False  

        print("不需要登录...")  


    if a == True:  

        driver.switch_to.frame('login_frame')  

        driver.find_element_by_id('switcher_plogin').click()  

        driver.find_element_by_id('u').clear()  # clear the username field

        driver.find_element_by_id('u').send_keys('QQ号码')  # replace with the QQ account number

        driver.find_element_by_id('p').clear()  # clear the password field

        driver.find_element_by_id('p').send_keys('QQ密码')  # replace with the QQ password

        driver.find_element_by_id('login_button').click()  

        time.sleep(3)  

    driver.implicitly_wait(3)  

      

    print("验证权限...")  

    try:  

        driver.find_element_by_id('QM_OwnerInfo_Icon')  

        b = True  

    except:  

        b = False  

          

    if b == True:  

        print("获取说说...")  

        driver.switch_to.frame('app_canvas_frame')  

        content = driver.find_elements_by_css_selector('.content')  

        stime = driver.find_elements_by_css_selector('.c_tx.c_tx3.goDetail')  

        for con,sti in zip(content,stime):  

            data = {  

                'time':sti.text,  

                'shuos':con.text  

            }  

            print(data)  

        pages = driver.page_source  

        #print(pages)  

        soup = BeautifulSoup(pages, 'lxml')  # full-page soup, available for further parsing (unused below)

  

    # Collect the session cookies as "name=value;" pairs

    cookie = driver.get_cookies()

    cookie_dict = []

    for c in cookie:  

        ck = "{0}={1};".format(c['name'],c['value'])  

        cookie_dict.append(ck)  

          

    i = ''  

    for c in cookie_dict:  

        i += c  

    print('Cookies:',i)  

    print("==========完成================")  

  

    driver.close()  

    driver.quit()  

  

if __name__ == '__main__':

    get_shuoshuo('QQ号码')  # pass the target QQ number as a string