Scraping web pages and saving them locally
savedata_Chrome_byurl.py
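This script drives Chrome through Selenium: it reads the target URLs and page codes from MySQL, opens each page, clicks through to the follow-up page, and saves the rendered HTML into a local 10004 folder for offline parsing.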
from selenium import webdriver
import time
import io
import csv
import pymysql
import os
import re
from lxml import etree
import codecs
def savepage(browser, filepath, pagename):
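    # Save the current page's full HTML (outerHTML of <html>) to <filepath>/<pagename>.html, creating the folder if needed.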
try:
if not os.path.exists(filepath):
os.mkdir(filepath)
textContent = browser.find_element_by_xpath('//html').get_attribute('outerHTML')
str_utf8 = textContent.encode("UTF-8")
textContent = str_utf8.decode('UTF-8', 'strict')
pagepath = filepath +'//'+ pagename + '.html'
fp = open(pagepath, "w", encoding='UTF-8')
fp.write(textContent)
fp.close()
except Exception as excpt:
print(excpt)
def getDbConn(db):
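    # Open a pymysql connection to the given database; host/user/password are chosen via an is-on-server flag derived from os.name.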
isonserver = True
osname = os.name
if osname == 'nt':
isonserver = False
print('windows')
else:
isonserver = True
print(os.name)
isonserver = False
if isonserver:
host = 'localhost'
user = 'root'
passwd = '123456'
else:
host = ''
user = ''
passwd = ''
# db = 'couponcategory'
port = 3306
conn = pymysql.connect(host=host, port=port, user=user, password=passwd, db=db)
return conn
def parse_data_page_step1(browser, url, urlname):
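    # For each credit-card item on the listing page, grab the detail-page link and append (url, href, urlname) as a row to fix10004.csv.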
print('doing.......')
creditcard__items = browser.find_elements_by_xpath('//div[@class="creditcard__item"]')
for creditcard__item in creditcard__items:
try:
# title = creditcard__item.find_element_by_xpath('.//h2[@class="creditcard__heading"]').get_attribute('textContent')
article = creditcard__item.find_element_by_xpath('./article')
href = article.find_element_by_xpath('./div[@class="compare"]').find_element_by_xpath('./div[last()]/a').get_attribute('href')
# .get_attribute('href')
item = {}
item['url'] = url
item['url2'] = href
item['info0'] = urlname
# item['info1'] = title
print(urlname)
print(url)
print(href)
stu1 = [url, href, urlname, '']
out = open('fix10004.csv', 'a', newline='')
# out = open('d:/data_source10004_v1.csv', 'a', newline='')
# set up the CSV writer
csv_write = csv.writer(out, dialect='excel')
# write the actual row
csv_write.writerow(stu1)
out.close()
except Exception as aas:
print(aas)
# print('write item.............................................')
# print(item)
# dbname = 'brcardsdata'
# dbtablename = 'data_source10004_url_v2'
# updateToDatabase(dbname, dbtablename, item)
# print('write item..............................................')
def get_key_url_map(dbname, tablename):
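    # Fetch (url, pagecode) pairs from the given table; these drive the scraping loop below.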
conn = getDbConn(dbname)
cursor = conn.cursor()
print("mysql connect success")
sql = "select url,pagecode from " + tablename
cursor.execute(sql)
dataresult = cursor.fetchall()
conn.close()
return dataresult
def scrapyStart1(browser, url, pagecode):
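    # Open the URL in Chrome, save the landing page under the 10004 folder, then click the pagehero button and save the follow-up page as <pagecode>_nextpage.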
# ways to read an element's content:
# get_attribute('textContent')
# get_attribute('innerHTML')
# get_attribute('outerHTML')
print('4')
time.sleep(1)
print('6')
browser.get(url)
print('7')
time.sleep(5)
print('8')
try:
savepage(browser, '10004', pagecode)
except Exception as errr:
print('........currpage....error......................')
print(errr)
try:
targetElem = browser.find_element_by_xpath('//div[@class="pagehero__button"]')
browser.execute_script("arguments[0].focus();", targetElem)
time.sleep(0.5)
targetElem.click()
time.sleep(1.8)
print('8')
pagecode2 = pagecode + '_nextpage'
savepage(browser, '10004', pagecode2)
except Exception as eerr:
print('........nextpage....error......................')
print(eerr)
# re.sub(r'\?.*','',url)
browser = webdriver.Chrome()
time.sleep(0.5)
browser.maximize_window()
time.sleep(1)
key_url_map = get_key_url_map('pagedata', 'data_source10004_url')
# key_url_map = [['https://www.foregon.com/solicitar/cartaodecredito/agillitas/fgn/cartao-pre-pago-agillitas-mundo-livre-visa/1028','1']]
for key_url in key_url_map:
url = key_url[0]
pagecode = key_url[1]
pagecode = str(pagecode)
print(url)
scrapyStart1(browser, url, pagecode)
time.sleep(100)
browser.close()
parsepagedata.py
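This script works offline on the pages saved above: it loads each HTML file from the 10004 folder, parses it with lxml, and extracts the card description, application channels and application link; updateToDatabase can then write those fields back to MySQL.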
from selenium import webdriver
import time
import io
import csv
import pymysql
import os
import re
from lxml import etree
from bs4 import BeautifulSoup
import numpy as np
import codecs
def etreeWebElemToOuterHtml(webitem):
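    # Serialize an lxml element back to its outer HTML string.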
outerHtml = etree.tostring(webitem)
outerHtml = outerHtml.decode('utf-8')
return outerHtml
def trimDataHtmlProAndImg(divstr):
    # Strip link/style/identity attributes from the HTML fragment.
    divstr = re.sub(r' href=".*?"', "", divstr)
    divstr = re.sub(r' class=".*?"', "", divstr)
    divstr = re.sub(r' target=".*?"', "", divstr)
    divstr = re.sub(r' align=".*?"', "", divstr)
    divstr = re.sub(r' rel=".*?"', "", divstr)
    divstr = re.sub(r' data-cfemail=".*?"', "", divstr)
    divstr = re.sub(r' id=".*?"', "", divstr)
    divstr = re.sub(r' name=".*?"', "", divstr)
    divstr = re.sub(r' style=".*?"', "", divstr)
    divstr = re.sub(r' src=".*?"', "", divstr)
    divstr = re.sub(r' dir=".*?"', "", divstr)
    # Drop <img> tags and <a>/</a> wrappers, keeping only the link text
    # (these tag patterns are reconstructed; the originals were garbled in the source).
    divstr = re.sub(r'<img[^>]*>', "", divstr)
    divstr = re.sub(r'<a\b[^>]*>', "", divstr)
    divstr = re.sub(r'</a>', "", divstr)
    # Normalise whitespace entities and line-break tags.
    divstr = divstr.replace('&nbsp;', ' ')
    divstr = divstr.replace('<br>', ' ')
    divstr = divstr.replace('</p>', '\n')
    return divstr
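A quick sketch of how the cleaner above is expected to behave, assuming the reconstructed tag patterns; the sample fragment is invented for illustration and not taken from the scraped site:
# hypothetical input fragment (made up for this example)
sample = '<div class="box--body" style="color:red"><a href="/apply" target="_blank">Solicitar</a>&nbsp;online<br></div>'
print(trimDataHtmlProAndImg(sample))
# attributes, <a>/<img> tags, &nbsp; and <br> are stripped, leaving roughly '<div>Solicitar online</div>'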
def loadpage(filepath, pagename):
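    # Read a previously saved page from <filepath>/<pagename>.html; returns '' if the file cannot be read.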
try:
pagepath = filepath + '//' + pagename + '.html'
htmlf = open(pagepath,'r',encoding="utf-8")
htmlContent = htmlf.read()
return htmlContent
except Exception as excpt:
print(excpt)
return ''
def parseWithBeautifulSoup(htmlContent):
    # BeautifulSoup variant of the same lookup, kept for reference.
    soup = BeautifulSoup(htmlContent, 'lxml')
    mululist = soup.find_all(class_='mulu')
    return mululist
def parseWithXpath(htmlContent):
    # lxml/XPath variant of the same lookup, kept for reference.
    html = etree.HTML(htmlContent)
    mululist = html.xpath('.//*[@class="mulu"]')
    return mululist
def getDbConn(db):
isonserver = True
osname = os.name
if osname == 'nt':
isonserver = False
print('windows')
else:
isonserver = True
print(os.name)
isonserver = False
if isonserver:
host = 'localhost'
user = 'root'
passwd = '123456'
else:
host = ''
user = ''
passwd = ''
port = 3306
conn = pymysql.connect(host=host, port=port, user=user, password=passwd, db=db)
return conn
def updateToDatabase(dbname, tablename, item):
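    # Build an UPDATE statement from whichever url/info0..info10 fields are present in item and apply it to the row matching url2.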
url2 = item['url2']
updatevalue = {'url2': url2}
setsqllist = []
collist = ['info0', 'info1', 'info2', 'info3', 'info4', 'info5', 'info6', 'info7', 'info8', 'info9', 'info10', 'url']
for idx in range(len(collist)):
colname = collist[idx]
if colname in item:
if item[colname]:
updatevalue[colname] = item[colname]
setsqllist.append(colname + '=%(' + colname + ')s')
setsqllistlen = len(setsqllist)
if setsqllistlen > 0:
updatesql = 'update ' + tablename + ' set '
setsqlliststr = ','.join(setsqllist)
wherestr = ' where url2=%(url2)s'
updatesql = updatesql + setsqlliststr + wherestr
print(updatesql)
# print(updatevalue)
conn = getDbConn(dbname)
cursor = conn.cursor()
try:
cursor.execute(updatesql, updatevalue)
except Exception as e:
print('Insert Error1', e)
conn.rollback()
else:
conn.commit()
conn.close()
def parse_data_page_step1(htmlContent, pageid):
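    # Parse the saved landing page: pull the card heading and description from the pagehero block into info1 and info8.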
print('doing.......')
html = etree.HTML(htmlContent)
divcon = html.xpath('//div[@class="pagehero__content"]')[0]
str1 = divcon.xpath('./div[@class="pagehero__wrapper"]/h1[@class="pagehero__heading"]')[0].text
str2 = divcon.xpath('./div[@class="pagehero__wrapper"]/strong[@class="pagehero__description"]')[0].text
item = {}
item['url2'] = url
item['info1'] = str1
item['info8'] = str2
print('write item.............................................')
print(item)
# dbname = 'brcardsdata'
# dbtablename = 'data_source10004_url'
# updateToDatabase(dbname, dbtablename, item)
print('write item..............................................')
def parse_data_page_step2(htmlContent, pageid):
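    # Parse the saved "_nextpage" HTML: walk the box--list items and collect the "online", "no local" and "por telefone" sections plus the application link.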
print('doing.......')
html = etree.HTML(htmlContent)
itemlist= html.xpath('//div[@class="box--list"]/div[@class="box--list-item"]')
info5 = ''
info6 = ''
info7 = ''
info10 = ''
for item in itemlist:
itemcon = item.xpath('./div[@class="box--container"]')[0]
str1 = itemcon.xpath('./div[@class="box--header"]/h3')[0].text
print(str1)
itemconbody = itemcon.xpath('./div[@class="box--body"]')[0]
str1 = str1.lower()
str1 = str1.strip()
# print(str1)
if str1 == 'online':
    str2item = itemconbody.xpath('./div[contains(@class,"notsignedin")]')[0]
    str2 = etreeWebElemToOuterHtml(str2item)
    # print(str2)
    str2 = trimDataHtmlProAndImg(str2)
    # wrap the cleaned fragment in a plain <div> (the original wrapper tags were garbled)
    info5 = '<div>' + str2 + '</div>'
    print('info5')
    print(info5)
if str1 == 'no local':
    str2item = itemconbody
    str2 = etreeWebElemToOuterHtml(str2item)
    str2 = trimDataHtmlProAndImg(str2)
    info6 = '<div>' + str2 + '</div>'
if str1 == 'por telefone':
    str2item = itemconbody
    str2 = etreeWebElemToOuterHtml(str2item)
    str2 = trimDataHtmlProAndImg(str2)
    info7 = '<div>' + str2 + '</div>'
if str1 == 'online':
    # the application link is carried in a data-redirect attribute
    try:
        info10 = itemconbody.xpath('./div[contains(@class,"notsignedin")]')[0].get('data-redirect')
    except Exception as exx:
        print('....................errr1.......................')
        print(exx)
    if not info10:
        try:
            info10 = itemconbody.xpath('./div[contains(@class,"notsignedin")]/button')[0].get('data-redirect')
        except Exception as exx:
            print('....................errr2.......................')
            print(exx)
    if info10:
        info10 = 'https://www.foregon.com' + info10
item = {}
item['url2'] = url
item['info5'] = info5
item['info6'] = info6
item['info7'] = info7
item['info10'] = info10
print('write item.............................................')
print(item)
# dbname = 'brcardsdata'
# dbtablename = 'data_source10004_url'
# updateToDatabase(dbname, dbtablename, item)
print('write item.................................................')
def get_key_url_map(dbname, tablename):
conn = getDbConn(dbname)
cursor = conn.cursor()
print("mysql connect success")
sql = "select url,pagecode from " + tablename
cursor.execute(sql)
dataresult = cursor.fetchall()
conn.close()
return dataresult
def scrapyStart1(url, pagecode):
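    # Load the two saved pages for this pagecode from the 10004 folder and run both parsing steps.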
htmlContent = loadpage('10004', pagecode)
parse_data_page_step1(htmlContent, pagecode)
pagecode2 = pagecode + '_nextpage'
htmlContent = loadpage('10004', pagecode2)
parse_data_page_step2(htmlContent, pagecode2)
# key_url_map = get_key_url_map('pagedata', 'data_source10004_url')
key_url_map = [['https://www.foregon.com/solicitar/cartaodecredito/agillitas/fgn/cartao-pre-pago-agillitas-mundo-livre-visa/1028','1']]
for key_url in key_url_map:
url = key_url[0]
pagecode = key_url[1]
pagecode = str(pagecode)
print(url)
scrapyStart1(url, pagecode)