Key points:
Expert tip: selenium + headless Chrome crawling
One thing to watch in crawler code: when you need to trigger an event, avoid calling the element's method (e.g. click) directly; it is better to invoke it through an injected JS snippet. The crawler runs much faster than the frontend can render, so a direct call often throws element-not-visible or element-not-found errors.
province_items = DRIVER.find_element_by_class_name("city-province").find_elements_by_tag_name("a")
for province_item in province_items:
    # province_item.click()  # direct click fails: the frontend can't keep up
    DRIVER.execute_script('arguments[0].click();', province_item)
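The same race can also be closed with an explicit wait: block until Selenium reports the element clickable, then fire the click. A minimal standalone sketch of that pattern (the .city-province selector is carried over from the snippet above; the rest is an assumption, not part of the original notes):
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

DRIVER = webdriver.Chrome()
wait = WebDriverWait(DRIVER, 10)
# block until the first province link is clickable, then still click via JS for safety
link = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".city-province a")))
DRIVER.execute_script('arguments[0].click();', link)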
#coding=utf-8
'''
1. Fetch the index-page content
2. Fetch the detail-page content
3. Download the images and save to the database
4. Loop and multithreading
'''
import requests
from requests.exceptions import RequestException
from json import loads
from bs4 import BeautifulSoup
user_agent = "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"
headers = {"User-Agent": user_agent}
def get_onepage_index(i, keywords):
    data = {
        "offset": i,
        "format": "json",
        "keyword": keywords,
        "autoload": "true",
        "count": "20",
        "cur_tab": "1",
        "from": "search_tab"
    }
    url = 'https://www.toutiao.com/search_content/?'
    try:
        response = requests.get(url, params=data)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('something is wrong!')
        return None
def parse_onepage_index(html):
    # json.loads() turns a JSON str into a dict
    data = loads(html)
    if data and 'data' in data.keys():  # keys() lists all the keys
        for item in data.get('data'):  # dict.get() returns the value for a key, or a default if the key is missing
            yield item.get('article_url')
def get_page_detail(url):
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            # print(response.status_code)
            return response.text
        return None
    except RequestException:
        print('wrong url:', url)
        return None
def parsepage(html):
    soup = BeautifulSoup(html, 'lxml')
    title = soup.title.string
    print(title)
def main():
    for i in range(1, 2):
        i = str(i * 20)
        html = get_onepage_index(i, '街拍')
        for url in parse_onepage_index(html):
            print(url)
            detailhtml = get_page_detail(url)  # returns the page HTML
            # print(detailhtml)
            if detailhtml is not None:
                parsepage(detailhtml)  # parse with bs4
    # get_page_detail('http://toutiao.com/group/6596305324645286404/')
if __name__ == '__main__':
    main()
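Steps 3 and 4 of the plan (image download, database save) stop short in the snippet above. A minimal sketch of the download half, assuming the image URLs have already been pulled from the detail page (save_image and the hash-based filename are illustrative, not from the original):
import os
from hashlib import md5
import requests
from requests.exceptions import RequestException

def save_image(url):
    # hypothetical helper: fetch one image, name the file by content hash so re-runs don't duplicate
    try:
        response = requests.get(url)
        if response.status_code == 200:
            filename = md5(response.content).hexdigest() + '.jpg'
            if not os.path.exists(filename):
                with open(filename, 'wb') as f:
                    f.write(response.content)
    except RequestException:
        print('failed to download image:', url)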
# -*- coding: utf-8 -*-
'''
Target-site analysis
Page-structure analysis
-- Get started --
1. Single-page content
2. Regex extraction
3. Save as JSON
4. Multithreaded loop
'''
# .* is greedy: it first matches as much as it can, then backtracks as the rest of the pattern requires.
# .*? is the opposite: it matches as little as possible and moves on, so no backtracking; minimal match.
# re.S lets . match newlines as well
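A quick standalone demonstration of the difference, on a toy string:
import re

text = '<b>one</b><b>two</b>'
print(re.findall('<b>.*</b>', text))   # greedy: ['<b>one</b><b>two</b>']
print(re.findall('<b>.*?</b>', text))  # non-greedy: ['<b>one</b>', '<b>two</b>']
# without re.S the . stops at a newline; with re.S it crosses it
print(re.findall('<b>.*?</b>', '<b>one\ntwo</b>'))        # []
print(re.findall('<b>.*?</b>', '<b>one\ntwo</b>', re.S))  # ['<b>one\ntwo</b>']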
--------------------------------
import json
import requests
from requests.exceptions import RequestException
import re
import time
from multiprocessing import Pool
headers = {  # very important
    'Accept-Language': 'en-US,en;q=0.8',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
    'Connection': 'keep-alive',
    'Referer': 'http://maoyan.com/board/6'
}
def get_one_page(url):
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None  # non-200
    except RequestException:
        return None
def parse_one_page(html):
    pattern = re.compile(r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a.*?>(.*?)</a>.*?star">(.*?)</p>'
                         + r'.*?releasetime">(.*?)</p>.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {  # yield turns this into a generator
            'index': item[0],
            'image': item[1],
            'title': item[2],
            'actor': item[3].strip()[3:],  # string cleanup: drop the '主演:' prefix
            'time': item[4].strip()[5:],   # drop the '上映时间:' prefix
            'score': item[5] + item[6]     # integer and fraction parts matched separately, then joined
        }
def write_to_file(content):
    with open('result.txt', 'a', encoding='utf-8') as f:  # specify the encoding
        f.write(json.dumps(content, ensure_ascii=False) + '\n')  # json.dumps escapes non-ASCII by default; ensure_ascii=False keeps the Chinese readable
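# A quick check of what ensure_ascii changes (illustrative REPL session, using the board's #1 title):
# >>> json.dumps({'title': '霸王别姬'})
# '{"title": "\\u9738\\u738b\\u522b\\u59ec"}'
# >>> json.dumps({'title': '霸王别姬'}, ensure_ascii=False)
# '{"title": "霸王别姬"}'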
def main(offset):
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)  # the page HTML returned above
    print(html)
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)
if __name__ == '__main__':
    for i in range(10):
        main(offset=i * 10)
        time.sleep(1)
    # process-pool alternative
    # pool = Pool()
    # pool.map(main, [i * 10 for i in range(10)])
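If the commented-out pool is preferred over the serial loop, it should stay under the __main__ guard so worker processes don't re-execute the module's top level on import; a minimal sketch, reusing the imports above:
if __name__ == '__main__':
    pool = Pool()  # defaults to os.cpu_count() workers
    pool.map(main, [i * 10 for i in range(10)])
    pool.close()
    pool.join()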
The program splits into two parts: the main file main.py and the MongoDB configuration file config.py.
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import TimeoutException
from pyquery import PyQuery as pq
import time
import re
from config import *
import pymongo
'''
1. Search for the keyword
2. Read the page count and turn pages
3. Parse and extract the data
4. Store it in the database
'''
# MongoDB connection
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
# headless Chrome: no visible window
# from selenium.webdriver.chrome.options import Options
# global browser
# chrome_options = Options()
# chrome_options.add_argument('--headless')
# chrome_options.add_argument('--disable-gpu')
# browser = webdriver.Chrome(chrome_options=chrome_options)
# browser.set_window_size(1400, 900)
# wait = WebDriverWait(browser, 10)
# headed mode: brings up a visible browser, easier to watch
global browser
browser = webdriver.Chrome()
browser.set_window_size(1400, 900)
wait = WebDriverWait(browser, 10)
# open the search page
def search():
    try:
        # Baidu example
        # browser.get("https://www.baidu.com")
        # input = browser.find_element_by_id("kw")
        # input.send_keys("Python")
        # input.send_keys(Keys.ENTER)
        # wait = WebDriverWait(browser, 10)
        # wait.until(EC.presence_of_element_located((By.ID, "content_left")))
        # print(browser.current_url)
        # print(browser.get_cookies())
        # print(browser.page_source)
        # Taobao
        browser.get("http://www.taobao.com")
        # 1. locate the search box and button
        input = wait.until(
            EC.presence_of_element_located((By.ID, "q")))
        submit = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#J_TSearchForm > div.search-button > button")))
        # 2. simulate typing and clicking
        input.send_keys('美食')
        submit.click()
        # wait until loading finishes before reading the total page count
        total = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.total"))
        ).text
        # the element text contains the page count; pull out the digits
        totalnum = int(re.search(r'\d+', total).group())
        return totalnum
    except TimeoutException:
        return search()
# turn to a given page automatically
def next_page(page_number):
    try:
        input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input")))
        submit = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit")))
        input.clear()  # clear the box first
        input.send_keys(page_number)
        submit.click()
        # confirm the jump worked: the highlighted pager item must show the requested page number
        wait.until(
            EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > ul > li.item.active > span"), str(page_number)))
        time.sleep(0.1)
        getproducts()
    except TimeoutException:
        next_page(page_number)
# extract the data
def getproducts():
    wait.until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, "#mainsrp-itemlist .items .item")))
    html = browser.page_source
    doc = pq(html)
    # .items() yields every element the selector matched
    items = doc('#mainsrp-itemlist .items .item').items()
    for item in items:
        product = {
            # 'image': item.find('.pic .img').attr('src'),
            'name': item.find('.pic .img').attr('alt'),
            'price': item.find('.price').text(),
            'deal': item.find('.deal-cnt').text()[:-2],
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text()
        }
        print(product)
        # save_to_mongo(product)
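# .items() is what makes a pyquery selection iterable as individual PyQuery wrappers, e.g.:
# >>> doc = pq('<div class="item"><a alt="a">x</a></div><div class="item"><a alt="b">y</a></div>')
# >>> [i.find('a').attr('alt') for i in doc('.item').items()]
# ['a', 'b']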
def save_to_mongo(result):
    try:
        if db[MONGO_TABLE].insert_one(result):  # insert_one replaces the deprecated insert
            print('saved', result)
    except Exception:
        print('save failed')
# main
def main():
    try:
        totalnum = search()
        # print(totalnum)
        for i in range(2, totalnum + 1):
            # for i in range(2, 3):
            next_page(i)
    except Exception:
        print('something went wrong')
    finally:
        browser.close()
if __name__ == '__main__':
    main()
Prerequisite: the MongoDB server must already be running. config.py:
MONGO_URL='localhost'
MONGO_DB='taobao'
MONGO_TABLE='table'
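A quick way to confirm MongoDB is actually up before running main.py (standalone sketch; the 2-second timeout is an arbitrary choice):
import pymongo
client = pymongo.MongoClient('localhost', serverSelectionTimeoutMS=2000)
print(client.server_info()['version'])  # raises ServerSelectionTimeoutError if mongod is down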