Scraping web pages, saving them locally, and parsing

Scraping pages and saving them locally

savedata_Chrome_byurl.py


from selenium import webdriver
import time
import csv
import pymysql
import os
import re

def savepage(browser, filepath, pagename):
    # save the current page's full HTML as filepath/pagename.html
    try:
        if not os.path.exists(filepath):
            os.mkdir(filepath)
        # outerHTML of the <html> node is the complete rendered page source
        textContent = browser.find_element_by_xpath('//html').get_attribute('outerHTML')
        pagepath = os.path.join(filepath, pagename + '.html')
        with open(pagepath, 'w', encoding='UTF-8') as fp:
            fp.write(textContent)
    except Exception as excpt:
        print(excpt)
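
Usage is one call per loaded page; the output directory is created on first use. A minimal sketch, assuming a live Chrome session (the page name 'homepage' is illustrative):

browser.get('https://www.foregon.com/')
savepage(browser, '10004', 'homepage')  # writes 10004/homepage.html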

def getDbConn(db):
    # detect whether we are running on the server (non-Windows) or locally
    isonserver = os.name != 'nt'
    if not isonserver:
        print('windows')
    else:
        print(os.name)
    # NOTE: the original script overrides the detection here, so the
    # credentials in the else-branch are always the ones used
    isonserver = False
    if isonserver:
        host = 'localhost'
        user = 'root'
        passwd = '123456'
    else:
        host = ''    # fill in your MySQL host
        user = ''    # fill in your MySQL user
        passwd = ''  # fill in your MySQL password
    # db = 'couponcategory'
    port = 3306
    conn = pymysql.connect(host=host, port=port, user=user, password=passwd, db=db)
    return conn
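
A quick way to verify the connection before running the full scrape; a minimal sketch, assuming the 'pagedata' database used further below exists:

conn = getDbConn('pagedata')
try:
    with conn.cursor() as cursor:
        cursor.execute('select 1')
        print(cursor.fetchone())  # prints (1,) if the connection is alive
finally:
    conn.close()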

def parse_data_page_step1(browser, url, urlname):
    print('doing.......')
    creditcard__items = browser.find_elements_by_xpath('//div[@class="creditcard__item"]')
    for creditcard__item in creditcard__items:
        try:
            # title = creditcard__item.find_element_by_xpath('.//h2[@class="creditcard__heading"]').get_attribute('textContent')
            article = creditcard__item.find_element_by_xpath('./article')
            # the last <div> under .compare holds the link to the card's detail page
            href = article.find_element_by_xpath('./div[@class="compare"]/div[last()]/a').get_attribute('href')
            item = {}
            item['url'] = url
            item['url2'] = href
            item['info0'] = urlname
            # item['info1'] = title
            print(urlname)
            print(url)
            print(href)
            stu1 = [url, href, urlname, '']
            # append one row per card; the 'excel' dialect writes standard CSV
            # (alternative output path: 'd:/data_source10004_v1.csv')
            with open('fix10004.csv', 'a', newline='') as out:
                csv_write = csv.writer(out, dialect='excel')
                csv_write.writerow(stu1)
        except Exception as aas:
            print(aas)
        # print(item)
        # dbname = 'brcardsdata'
        # dbtablename = 'data_source10004_url_v2'
        # updateToDatabase(dbname, dbtablename, item)
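
The commented-out block above calls an updateToDatabase helper that the post never shows. A minimal sketch of what it could look like, assuming a table whose columns mirror the item dict (url, url2, info0, info1); the schema is an assumption, not part of the source:

def updateToDatabase(dbname, tablename, item):
    # hypothetical helper: the column layout below is assumed
    conn = getDbConn(dbname)
    try:
        with conn.cursor() as cursor:
            sql = ('insert into ' + tablename +
                   ' (url, url2, info0, info1) values (%s, %s, %s, %s)')
            cursor.execute(sql, (item.get('url'), item.get('url2'),
                                 item.get('info0'), item.get('info1', '')))
        conn.commit()
    finally:
        conn.close()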

def get_key_url_map(dbname, tablename):
    conn = getDbConn(dbname)
    cursor = conn.cursor()
    print("mysql connect success")
    sql = "select url,pagecode from " + tablename
    cursor.execute(sql)
    dataresult = cursor.fetchall()
    conn.close()
    return dataresult
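
The query above only needs url and pagecode columns in the source table. A minimal sketch of a backing table, with column names taken from the query and types assumed for illustration:

ddl = '''
create table if not exists data_source10004_url (
    url      varchar(512) not null,
    pagecode varchar(64)  not null
)
'''
conn = getDbConn('pagedata')
with conn.cursor() as cursor:
    cursor.execute(ddl)
conn.commit()
conn.close()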

def scrapyStart1(browser, url, pagecode):
    # get_attribute('textContent') / ('innerHTML') / ('outerHTML') return an
    # element's text, inner markup, or full markup respectively
    time.sleep(1)
    browser.get(url)
    time.sleep(5)  # crude wait for the page to finish loading
    try:
        savepage(browser, '10004', pagecode)
    except Exception as errr:
        print('........currpage....error......................')
        print(errr)
    try:
        # click the hero button (it presumably reveals a second page of results)
        targetElem = browser.find_element_by_xpath('//div[@class="pagehero__button"]')
        browser.execute_script("arguments[0].focus();", targetElem)
        time.sleep(0.5)
        targetElem.click()
        time.sleep(1.8)
        pagecode2 = pagecode + '_nextpage'
        savepage(browser, '10004', pagecode2)
    except Exception as eerr:
        print('........nextpage....error......................')
        print(eerr)
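
The fixed time.sleep calls work but waste time and can still race a slow page. A sketch of the same flow using Selenium's explicit waits (WebDriverWait and expected_conditions are part of Selenium's standard API); the 15-second timeout is an arbitrary choice:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def scrapyStart1_waited(browser, url, pagecode):
    browser.get(url)
    wait = WebDriverWait(browser, 15)
    # proceed as soon as the card list is actually present
    wait.until(EC.presence_of_element_located(
        (By.XPATH, '//div[@class="creditcard__item"]')))
    savepage(browser, '10004', pagecode)
    try:
        button = wait.until(EC.element_to_be_clickable(
            (By.XPATH, '//div[@class="pagehero__button"]')))
        button.click()
        wait.until(EC.presence_of_element_located(
            (By.XPATH, '//div[@class="creditcard__item"]')))
        savepage(browser, '10004', pagecode + '_nextpage')
    except Exception as eerr:
        print(eerr)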

# re.sub(r'\?.*','',url)
browser = webdriver.Chrome()
time.sleep(0.5)
browser.maximize_window()
time.sleep(1)
key_url_map = get_key_url_map('pagedata', 'data_source10004_url')
# key_url_map = [['https://www.foregon.com/solicitar/cartaodecredito/agillitas/fgn/cartao-pre-pago-agillitas-mundo-livre-visa/1028','1']]
for key_url in key_url_map:
    url = key_url[0]
    pagecode = str(key_url[1])
    print(url)
    scrapyStart1(browser, url, pagecode)
time.sleep(100)
browser.close()


Parsing the saved pages

parsepagedata.py


from selenium import webdriver
import time
import io
import csv
import pymysql
import os
import re
from lxml import etree
from bs4 import BeautifulSoup
import numpy as np
import codecs

def etreeWebElemToOuterHtml(webitem):
    # serialize an lxml element back to an HTML string
    outerHtml = etree.tostring(webitem)
    outerHtml = outerHtml.decode('utf-8')
    return outerHtml
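
A minimal usage sketch, assuming the pages were saved into the 10004 directory by the first script (the file name '1.html' is illustrative):

with open('10004/1.html', encoding='UTF-8') as fp:
    tree = etree.HTML(fp.read())
for node in tree.xpath('//div[@class="creditcard__item"]'):
    fragment = etreeWebElemToOuterHtml(node)
    print(trimDataHtmlProAndImg(fragment))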

def trimDataHtmlProAndImg(divstr):
    # strip attributes that carry no useful content from the saved HTML
    divstr = re.sub(r' href=".*?"', "", divstr)
    divstr = re.sub(r' class=".*?"', "", divstr)
    divstr = re.sub(r' target=".*?"', "", divstr)
    divstr = re.sub(r' align=".*?"', "", divstr)
    divstr = re.sub(r' rel=".*?"', "", divstr)
    # (one substitution pattern here was lost when the post was rendered)
    divstr = re.sub(r' data-cfemail=".*?"', "", divstr)
    divstr = re.sub(r' id=".*?"', "", divstr)
    divstr = re.sub(r' name=".*?"', "", divstr)
    divstr = re.sub(r' style=".*?"', "", divstr)
    divstr = re.sub(r' src=".*?"', "", divstr)
    divstr = re.sub(r' dir=".*?"', "", divstr)

    # The remaining substitutions replaced literal HTML tags (such as <p> or
    # <br>) with newlines or removed them outright; their tag patterns were
    # swallowed when the post was rendered as HTML and cannot be reconstructed.
    # The source post is also truncated at this point.
    return divstr  # (the function presumably ended by returning the cleaned string)
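
Since BeautifulSoup is already imported, the attribute stripping can also be done on the parsed tree, which is more robust than regexes over raw HTML. A sketch of an equivalent cleaner; the attribute list mirrors the regexes above:

def trim_attrs_with_bs4(divstr):
    # parse the fragment and drop noisy attributes tag by tag
    soup = BeautifulSoup(divstr, 'html.parser')
    drop = {'href', 'class', 'target', 'align', 'rel', 'data-cfemail',
            'id', 'name', 'style', 'src', 'dir'}
    for tag in soup.find_all(True):
        for attr in list(tag.attrs):
            if attr in drop:
                del tag.attrs[attr]
    return str(soup)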
