Python3 Crawler: Three Practical Cases with Selenium + Chrome/Headless Chrome

Key points:

Pro tip: a selenium + headless chrome crawler

One thing to watch out for in crawler code: when you need to fire an event, it is best not to call the element's method (such as click) directly, but to invoke it through an injected JavaScript snippet instead. Crawler code executes very quickly, and the front-end DOM often cannot keep up, which leads to "element not visible" or "element does not exist" errors. (An explicit-wait alternative is sketched after the snippet below.)

province_items = DRIVER.find_element_by_class_name("city-province").find_elements_by_tag_name("a")
for province_item in province_items:
    # province_item.click()  # the front end cannot keep up with direct clicks
    DRIVER.execute_script('arguments[0].click();', province_item)
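
An alternative that avoids the same race (a minimal sketch, assuming the same DRIVER object) is to wait explicitly until the element is clickable before calling click():

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# Wait up to 10 seconds until the first province link is clickable, then click it.
wait = WebDriverWait(DRIVER, 10)
link = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".city-province a")))
link.click()

The JS-injection route keeps one advantage: it also works on elements that are present in the DOM but visually obscured.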

Three Practical Python3 Crawler Cases

1. Scraping Toutiao street-photo ('街拍') galleries

#coding=utf-8

'''
1. Fetch the index page
2. Fetch each detail page
3. Download the images and save the data to a database
4. Loop, then multithread (steps 3 and 4 are sketched after this listing)
'''

import requests
from requests.exceptions import RequestException
from json import loads
from bs4 import BeautifulSoup

user_agent = "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"
headers = {"User-Agent": user_agent}

def get_onepage_index(i, keywords):
    data = {
        "offset": i,
        "format": "json",
        "keyword": keywords,
        "autoload": "true",
        "count": "20",
        "cur_tab": "1",
        "from": "search_tab"
    }
    url = 'https://www.toutiao.com/search_content/?'
    try:
        response = requests.get(url, params=data)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('something is wrong!')
        return None

def parse_onepage_index(html):
    # json.loads() converts a JSON string into a dict.
    data = loads(html)
    if data and 'data' in data.keys():  # look through all the keys
        for item in data.get('data'):  # dict.get() returns the value for a key, or a default if it is missing
            yield item.get('article_url')

def get_page_detail(url):
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            # print(response.status_code)
            return response.text
        return None
    except RequestException:
        print('wrong url:', url)
        return None

def parsepage(html):
    soup = BeautifulSoup(html, 'lxml')
    title = soup.title.string
    print(title)

def main():
    for i in range(1, 2):
        i = str(i * 20)
        html = get_onepage_index(i, '街拍')
        for url in parse_onepage_index(html):
            print(url)
            detailhtml = get_page_detail(url)  # returns the page HTML
            # print(detailhtml)
            if detailhtml is not None:
                parsepage(detailhtml)  # parse it with bs4

# get_page_detail('http://toutiao.com/group/6596305324645286404/')

if __name__ == '__main__':
    main()
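
Steps 3 and 4 of the plan (downloading images, looping with multiple workers) are not implemented above. Below is a minimal sketch that reuses the imports and functions from the listing; save_image and crawl are hypothetical helpers added here purely for illustration:

import os
from hashlib import md5
from multiprocessing import Pool

def save_image(url):
    # Download one image; name the file by the MD5 of its content so
    # duplicate downloads are skipped. (Hypothetical helper, not in the original.)
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        os.makedirs('images', exist_ok=True)
        file_path = os.path.join('images', md5(response.content).hexdigest() + '.jpg')
        if not os.path.exists(file_path):
            with open(file_path, 'wb') as f:
                f.write(response.content)

def crawl(offset):
    # One unit of work: one index page plus all of its detail pages.
    html = get_onepage_index(str(offset), '街拍')
    if html:
        for url in parse_onepage_index(html):
            detail = get_page_detail(url)
            if detail is not None:
                parsepage(detail)

if __name__ == '__main__':
    pool = Pool()
    pool.map(crawl, [i * 20 for i in range(1, 5)])

Pool uses processes; since this workload is I/O-bound, a thread pool (multiprocessing.dummy.Pool, same interface) would work just as well.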

2. Session 2: Scraping Maoyan's most-anticipated movies chart with Requests + regular expressions

# -*- coding: utf-8 -*-
'''
Analyze the target site
Analyze the page structure
-- plan --
1. Fetch a single page
2. Parse it with a regular expression
3. Save the results as JSON lines
4. Loop, optionally with a process pool
'''
# .* is greedy: it first matches as much as it can, then backtracks as needed
# to satisfy the rest of the pattern.
# .*? is the opposite: it matches as little as possible and moves on, so it
# does not backtrack; it has the minimal-match property.
# (See the standalone snippet after this listing for a quick demonstration.)
# re.S makes . match newlines as well.
import json
import requests
from requests.exceptions import RequestException
import re
import time
from multiprocessing import Pool

headers = {  # very important
    'Accept-Language': 'en-US,en;q=0.8',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
    'Connection': 'keep-alive',
    'Referer': 'http://maoyan.com/board/6'
}

def get_one_page(url):
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None  # non-200 status
    except RequestException:
        return None


def parse_one_page(html):
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        + r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        + r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {  # makes this function a generator
            'index': item[0],
            'image': item[1],
            'title': item[2],
            'actor': item[3].strip()[3:],  # drop the '主演:' prefix
            'time': item[4].strip()[5:],   # drop the '上映时间:' prefix
            'score': item[5] + item[6]     # integer and fraction parts matched separately, then concatenated
        }

def write_to_file(content):
    with open('result.txt', 'a', encoding='utf-8') as f:  # explicit encoding
        # json.dumps ASCII-escapes Chinese by default; ensure_ascii=False keeps it readable
        f.write(json.dumps(content, ensure_ascii=False) + '\n')

def main(offset):
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    print(html)
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)

if __name__ == '__main__':
    for i in range(10):
        main(offset=i * 10)
        time.sleep(1)
    # process-pool alternative:
    # pool = Pool()
    # pool.map(main, [i * 10 for i in range(10)])
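
A quick standalone demonstration of the greedy vs. non-greedy behavior described in the comments at the top of this listing:

import re

s = '<dd>1</dd><dd>2</dd>'
print(re.findall(r'<dd>(.*)</dd>', s))   # greedy: ['1</dd><dd>2']
print(re.findall(r'<dd>(.*?)</dd>', s))  # non-greedy: ['1', '2']

This is exactly why the Maoyan pattern uses .*? throughout: each group must stop at the nearest closing tag instead of swallowing the rest of the page.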


3. Scraping Taobao food listings with Selenium + Chrome/PhantomJS

The code comes in two parts: the main script main.py and the MongoDB configuration file config.py.

1. main.py

# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from pyquery import PyQuery as pq
import time
import re
from config import *  # config.py defines MONGO_URL, MONGO_DB, MONGO_TABLE (see below)
import pymongo

'''
1. Search for the keyword
2. Read the total page count and page through the results
3. Parse and extract the data
4. Save it to the database
'''

# MongoDB connection
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]


# headless chrome (no-UI) mode
# from selenium.webdriver.chrome.options import Options
# global browser
# chrome_options = Options()
# chrome_options.add_argument('--headless')
# chrome_options.add_argument('--disable-gpu')
# browser = webdriver.Chrome(chrome_options=chrome_options)
# browser.set_window_size(1400, 900)
# wait = WebDriverWait(browser, 10)
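# Note: in Selenium 4.x the chrome_options= keyword is deprecated in favor of
# options=. A minimal sketch of the same headless setup under a recent
# selenium release (an assumption, not part of the original code):
# from selenium.webdriver.chrome.options import Options
# options = Options()
# options.add_argument('--headless')
# options.add_argument('--disable-gpu')
# browser = webdriver.Chrome(options=options)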


# Headed mode: launch a visible browser window so the run is easy to observe.
browser = webdriver.Chrome()
browser.set_window_size(1400, 900)
wait = WebDriverWait(browser, 10)

# open the site and run the initial search
def search():
    try:
        # Baidu example, kept for reference:
        # browser.get("https://www.baidu.com")
        # input = browser.find_element_by_id("kw")
        # input.send_keys("Python")
        # input.send_keys(Keys.ENTER)
        # wait = WebDriverWait(browser, 10)
        # wait.until(EC.presence_of_element_located((By.ID, "content_left")))
        # print(browser.current_url)
        # print(browser.get_cookies())
        # print(browser.page_source)

        # Taobao
        browser.get("http://www.taobao.com")
        # 1. locate the search box and the search button
        input = wait.until(
            EC.presence_of_element_located((By.ID, "q")))
        submit = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#J_TSearchForm > div.search-button > button")))
        # 2. simulate the user's input
        input.send_keys('美食')
        submit.click()
        # wait until the results have loaded before reading the total page count
        total = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.total"))
        ).text
        # the text looks like '共 100 页,'; extract the digits
        totalnum = int(re.search(r'\d+', total).group())
        return totalnum
    except TimeoutException:
        return search()  # retry on timeout

# jump to a given results page
def next_page(page_number):
    try:
        input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input")))
        submit = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit")))
        input.clear()  # clear the page-number input first
        input.send_keys(page_number)
        submit.click()
        # confirm the jump succeeded: the highlighted page number must equal page_number
        wait.until(
            EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > ul > li.item.active > span"), str(page_number)))
        time.sleep(0.1)
        getproducts()
    except TimeoutException:
        next_page(page_number)  # retry on timeout

# extract the product data from the current result page
def getproducts():
    wait.until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, "#mainsrp-itemlist .items .item")))
    html = browser.page_source
    doc = pq(html)
    # .items() yields every element matched by the selector
    items = doc('#mainsrp-itemlist .items .item').items()
    for item in items:
        product = {
            # 'image': item.find('.pic .img').attr('src'),
            'name': item.find('.pic .img').attr('alt'),
            'price': item.find('.price').text(),
            'deal': item.find('.deal-cnt').text()[:-2],  # strip the trailing suffix from the deal-count text
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text()
        }
        print(product)
        # save_to_mongo(product)
def save_to_mongo(result):
    try:
        if db[MONGO_TABLE].insert_one(result):  # insert() was removed in pymongo 4
            print('saved to MongoDB:', result)
    except Exception:
        print('failed to save to MongoDB')

# main entry point
def main():
    try:
        totalnum = search()
        # print(totalnum)
        for i in range(2, totalnum + 1):
        # for i in range(2, 3):
            next_page(i)
    except Exception:
        print('something went wrong')
    finally:
        browser.close()

if __name__=='__main__':
    main()


2. config.py

Prerequisite: start the MongoDB server first.

MONGO_URL='localhost'
MONGO_DB='taobao'
MONGO_TABLE='table'
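
As a quick sanity check that these settings point at a live MongoDB instance, a minimal standalone sketch (serverSelectionTimeoutMS and list_collection_names are standard pymongo 3.6+ API):

import pymongo
from config import MONGO_URL, MONGO_DB

# Fail fast (after ~2s) instead of the default 30s if the server is unreachable.
client = pymongo.MongoClient(MONGO_URL, serverSelectionTimeoutMS=2000)
print(client[MONGO_DB].list_collection_names())  # raises ServerSelectionTimeoutError if MongoDB is down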

