主要使用到了 IP 代理池、模拟浏览器自动化检测、时间推送、数据库连接等技术。
具体代码:
`# -*- coding: utf-8 -*-
from selenium import webdriver
import re
from _md5 import md5
import happybase
import datetime
import time
from selenium.webdriver.common.proxy import Proxy
from selenium.webdriver.common.proxy import ProxyType
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import random
# HBase Thrift connection settings.
# The three values below are PLACEHOLDERS taken from the original write-up;
# replace them with the real deployment settings before running the crawler.
HBASE_HOST = "Your host address"
HBASE_PORT = 9090  # Your port (9090 is the default HBase Thrift port)
HBASE_TABLE = "Your table"

# Lower-case aliases; write() reads `table_name`.
host = HBASE_HOST
port = HBASE_PORT
table_name = HBASE_TABLE

# autoconnect=False: no socket is opened at import time; write() opens and
# closes the connection explicitly around each put.
connection = happybase.Connection(host=host, port=port, timeout=None, autoconnect=False)
def get_ip(ip_file="ipforautohome.txt", max_attempts=None):
    """Start a Chrome WebDriver routed through a randomly chosen HTTP proxy.

    Proxy addresses are read from *ip_file*, one per line; blank lines are
    ignored. A proxy is picked at random and a Chrome driver is created with
    it; if driver creation raises, another random proxy is tried.

    Args:
        ip_file: Path of the UTF-8 text file holding the proxy pool.
        max_attempts: Maximum number of driver-creation attempts. ``None``
            (the default) retries without limit, as the original code did.

    Returns:
        The started ``webdriver.Chrome`` instance.

    Raises:
        ValueError: If *ip_file* contains no proxy addresses.
        RuntimeError: If *max_attempts* is set and every attempt failed.
    """
    # Read the pool with a context manager so the file handle is always closed.
    with open(ip_file, encoding='utf-8') as proxy_file:
        proxie = [line.strip() for line in proxy_file if line.strip()]
    print(proxie)

    if not proxie:
        # random.choice() on an empty list would raise an opaque IndexError.
        raise ValueError("no proxy addresses found in " + ip_file)

    attempts = 0
    while max_attempts is None or attempts < max_attempts:
        attempts += 1
        # Pick a random proxy from the pool.
        ip = random.choice(proxie)
        print("代理IP:", ip)
        print("代理类型:", ProxyType.MANUAL)
        try:
            proxy = Proxy({
                # Manual proxy configuration.
                'proxyType': ProxyType.MANUAL,
                # HTTP proxy address.
                'httpProxy': ip
            })
            # The driver started below is Chrome, so start from the Chrome
            # capability set (the original copied the PhantomJS one).
            desired_capabilities = DesiredCapabilities.CHROME.copy()
            proxy.add_to_capabilities(desired_capabilities)
            return webdriver.Chrome(
                desired_capabilities=desired_capabilities
            )
        except Exception as e:
            # Driver creation failed with this proxy: report why and retry.
            print("代理IP请求失败" + ip)
            print(e)

    raise RuntimeError(
        "could not start a proxied driver after %d attempts" % attempts
    )
def _extract_row(index, chexing):
    """Scrape the thread at position *index* of the loaded list page into a dict.

    Uses the module-level ``driver``. Each element is located once and reused
    for both its text and its ``href``. Raises whatever the driver raises when
    an element is missing, and ``IndexError`` when a URL has fewer path
    segments than expected; the caller treats any failure as "no such row".
    """
    base = '//*[@id="subcontent"]/dl[%d]' % index

    title_link = driver.find_element_by_xpath(base + '/dt/a[1]')
    title_url = title_link.get_attribute('href')
    title_parts = title_url.split('/')

    author_link = driver.find_element_by_xpath(base + '/dd[1]/a')
    author_url = author_link.get_attribute('href')

    pub_date = driver.find_element_by_xpath(base + '/dd[1]/span').text.strip()
    reply_count = driver.find_element_by_xpath(base + '/dd[2]/span[1]').text.strip()
    click_count = driver.find_element_by_xpath(base + '/dd[2]/span[2]').text.strip()

    last_link = driver.find_element_by_xpath(base + '/dd[3]/a')
    last_url = last_link.get_attribute('href')
    last_date = driver.find_element_by_xpath(base + '/dd[3]/span').text.strip()

    return {
        # car model id
        'chexing_id': chexing,
        # thread title, its URL, and the two ids taken from the URL path
        'title': title_link.text.strip(),
        'title_url': title_url,
        'title_urlid1': title_parts[5],
        # escaped dot: only a literal ".html" is removed
        'title_urlid2': re.sub(r'\.html', '', title_parts[6]),
        # author name, profile URL and id (4th "/"-separated URL segment)
        'author': author_link.text.strip(),
        'author_url': author_url,
        'author_id': author_url.split('/')[3],
        # publication date text
        'pub_date': pub_date,
        # reply count / click count
        'hf_count': reply_count,
        'dj_count': click_count,
        # last replier name (not stripped, as in the original), URL and id
        'last_people': last_link.text,
        'last_peopleUrl': last_url,
        'lastPeople_id': last_url.split('/')[3],
        # last reply time
        'last_date': last_date,
    }


def get_cont(url, chexing):
    """Load one forum list page and store its threads from the last 60 days.

    Threads published today are skipped. Scanning stops at the first thread
    older than 60 days; the rows collected up to that point are still written
    (the original returned before writing them, dropping the in-window rows
    of the boundary page).

    Dates are compared as strings against ``%Y-%m-%d`` values.
    NOTE(review): this assumes the page renders publication dates in that
    same format — confirm against the site.

    Args:
        url: Address of the list page to load in the module-level ``driver``.
        chexing: Car-model id stored with every row.

    Returns:
        The publication date text of the first thread older than 60 days, or
        ``None`` if no such thread was seen on this page.
    """
    driver.get(url)

    # Loop-invariant date bounds, computed once per page.
    now = datetime.datetime.now()
    today = now.strftime('%Y-%m-%d')
    oldest_kept = (now - datetime.timedelta(days=60)).strftime('%Y-%m-%d')

    autohome_luntan_list = []
    cutoff_date = None
    for index in range(1, 108):
        try:
            row = _extract_row(index, chexing)
        except Exception:
            # Deliberate best effort: positions 1..107 are probed blindly and
            # any position that cannot be scraped completely is skipped.
            continue

        pub_date = row['pub_date']
        if pub_date == today:
            # Today's threads are not collected.
            continue
        if pub_date < oldest_kept:
            # First thread outside the window: stop scanning this page.
            cutoff_date = pub_date
            break
        autohome_luntan_list.append(row)

    for cont in autohome_luntan_list:
        print(cont)
        write(cont)

    return cutoff_date
def write(cont):
    """Store one scraped thread as a row of the HBase table ``table_name``.

    The row key is the MD5 hex digest of the current date (``%Y%m%d``)
    concatenated with the thread title and the thread URL. Every field is
    stored under column family ``cf1``.

    Args:
        cont: Mapping holding the fields listed in ``columns`` below.

    Raises:
        KeyError: If *cont* lacks one of the expected fields (raised before
            the connection is opened).
    """
    # Public hashlib API instead of the private ``_md5`` extension module.
    import hashlib

    columns = (
        'chexing_id',
        'title',
        'title_url',
        'title_urlid1',
        'title_urlid2',
        'author',
        'author_url',
        'author_id',
        'pub_date',
        'hf_count',
        'dj_count',
        'last_people',
        'last_peopleUrl',
        'lastPeople_id',
        'last_date',
    )

    rowkey = datetime.datetime.now().strftime('%Y%m%d') + cont['title'] + cont['title_url']
    row_id = hashlib.md5(rowkey.encode('utf-8')).hexdigest()
    # Build the payload before opening the connection so a missing field
    # cannot leave the connection open.
    data = {"cf1:" + name: cont[name] for name in columns}

    connection.open()
    try:
        connection.table(table_name).put(row_id, data)
    finally:
        # Always release the connection, even if the put fails.
        connection.close()
if __name__ == '__main__':
    # Driver routed through the proxy pool. It is assigned at module level on
    # purpose: get_cont() reads the global name `driver`.
    driver = get_ip()
    # To crawl from the local IP instead, use: driver = webdriver.Chrome()
    try:
        # Car-model file: comma-separated model ids, possibly on several lines.
        # NOTE(review): fp_path is not defined anywhere in the visible file;
        # set it to the model-list path before running.
        with open(fp_path, 'r', encoding='utf-8') as model_file:
            # Strip whitespace/newlines and drop empty entries so they do not
            # end up inside the forum URL.
            cx = [
                model.strip()
                for line in model_file
                for model in line.split(',')
                if model.strip()
            ]

        # Crawl every car model.
        for chexing in cx:
            try:
                index = 1
                while True:
                    time.sleep(1.5)
                    url = "https://club.*******.com.cn/bbs/forum-c-%s-%d.html?orderby=dateline&qaType=-1" % (
                        chexing, index)
                    print("正在获取%s车型%d页数据" % (chexing, index))
                    pub_date = get_cont(url, chexing)
                    # get_cont() returns a date only when it met a thread older
                    # than the 60-day window. Any such date ends this model;
                    # the original required it to equal exactly day 61 and so
                    # usually never stopped.
                    if pub_date is not None:
                        break
                    # Next page.
                    index += 1
            except Exception as exc:
                # A failure ends the current model only; the loop moves on to
                # the next model instead of aborting the whole crawl.
                print("当前车型爬虫执行完毕")
                print(exc)
    finally:
        # Always shut the browser down, even if the crawl aborted.
        print("关闭驱动")
        driver.quit()
`