Crawler Learning Notes
1. Crawling the cnblogs news pages
- For this project I chose Scrapy together with Selenium; their convenience suits a code-heavy project like this one and makes later modification and maintenance easier. The knowledge points involved in crawling the news pages are listed below (a small Item/ItemLoader sketch follows the list):
- Regular expressions
- XPath basics
- CSS selectors
- The yield keyword
- Defining and using Items
- Pipelines
- ItemLoader
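To make the Items and ItemLoader points concrete, here is a minimal sketch of what an ArticleSpider/items.py could look like. The field names and processors are assumptions reconstructed from how the spider code below uses them, not the original file.

import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, Join, MapCompose, Identity


class ArticleItemLoader(ItemLoader):
    # Return a single value per field instead of a list.
    default_output_processor = TakeFirst()


class JobBoleArticleItem(scrapy.Item):
    title = scrapy.Field()
    create_date = scrapy.Field()
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    content = scrapy.Field()
    tags = scrapy.Field(
        input_processor=MapCompose(str.strip),   # strip whitespace around each tag
        output_processor=Join(","),              # join all tags into one string
    )
    praise_nums = scrapy.Field()
    fav_nums = scrapy.Field()
    comment_nums = scrapy.Field()
    front_image_url = scrapy.Field(
        output_processor=Identity(),             # the images pipeline expects a list
    )
    front_image_path = scrapy.Field()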
With the points above, page element extraction is covered: after analysing the page structure, the content is easy to obtain and later changes are straightforward. Image extraction works the same way. Data is saved through a pipeline that inserts into the database asynchronously (a sketch follows).
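The asynchronous insert is typically done in a pipeline backed by Twisted's adbapi connection pool, so database writes do not block the crawl. A minimal sketch, assuming MySQL, a table named cnblogs_article and MYSQL_* keys in settings.py (these names are illustrative, not necessarily the project's exact ones):

import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi


class MysqlTwistedPipeline(object):
    """Write items to MySQL asynchronously through a Twisted connection pool."""

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            charset="utf8",
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        # runInteraction runs the blocking insert in a thread pool,
        # so the reactor (and the crawl) is never stalled.
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)
        return item

    def handle_error(self, failure, item, spider):
        spider.logger.error(failure)

    def do_insert(self, cursor, item):
        insert_sql = """
            insert into cnblogs_article(title, url, url_object_id, create_date)
            values (%s, %s, %s, %s)
        """
        cursor.execute(insert_sql, (item["title"], item["url"],
                                    item["url_object_id"], item["create_date"]))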
Common problems: primary-key conflicts when inserting data, and image download errors during large-scale crawling (sketches of possible fixes follow).
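Two sketches for these problems, again under assumptions about the table and item fields. For the primary-key conflict, ON DUPLICATE KEY UPDATE lets MySQL update the existing row instead of raising an IntegrityError on a repeated url_object_id; for large-scale image downloads, a customized images pipeline skips empty URLs and tolerates failed downloads.

from scrapy.http import Request
from scrapy.pipelines.images import ImagesPipeline


# 1) Primary-key conflict: replace the plain insert in do_insert() with an
#    "upsert", so a duplicate url_object_id updates the row instead of failing.
insert_sql = """
    insert into cnblogs_article(title, url, url_object_id, create_date, praise_nums)
    values (%s, %s, %s, %s, %s)
    on duplicate key update praise_nums=values(praise_nums)
"""


# 2) Image download errors: only request non-empty URLs and ignore entries
#    whose download failed instead of crashing on a missing result.
class ArticleImagePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item.get("front_image_url", []):
            if image_url:
                yield Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x["path"] for ok, x in results if ok]
        if image_paths:
            item["front_image_path"] = image_paths[0]
        return item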
Code:
from urllib import parse
import re
import json

import scrapy
from scrapy import Request

from ArticleSpider.items import JobBoleArticleItem, ArticleItemLoader
from ArticleSpider.utils import common


class JobboleSpider(scrapy.Spider):
    name = 'jobbole'
    allowed_domains = ['news.cnblogs.com']
    start_urls = ['http://news.cnblogs.com/']

    def parse(self, response):
        # Each news entry on the list page; the [3:4] slice limits the crawl
        # to a single entry while debugging -- drop it for a full crawl.
        post_nodes = response.css('#news_list .news_block')[3:4]
        for post_node in post_nodes:
            image_url = post_node.css('.entry_summary a img::attr(src)').extract_first("")
            post_url = post_node.css('h2 a::attr(href)').extract_first("")
            yield Request(url=parse.urljoin(response.url, post_url),
                          meta={"front_image_url": image_url},
                          callback=self.parse_detail)

        # Follow the next page (disabled while debugging a single entry):
        # next_url = response.xpath("//a[contains(text(),'Next >')]/@href").extract_first("")
        # yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)

    def parse_detail(self, response):
        # The numeric post id in the URL is needed for the Ajax counters below.
        match_re = re.match(r".*?(\d+)", response.url)
        if match_re:
            post_id = match_re.group(1)

            # Fill the item with an ItemLoader instead of assigning each field by hand;
            # CSS selectors are used here, the XPath equivalents work just as well
            # (e.g. //*[@id="news_title"]//a/text() for the title).
            item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
            item_loader.add_css("title", "#news_title a::text")
            item_loader.add_css("create_date", "#news_info .time::text")
            item_loader.add_css("content", "#news_content")
            item_loader.add_css("tags", ".news_tags a::text")
            item_loader.add_value("url", response.url)
            item_loader.add_value("front_image_url", response.meta.get("front_image_url", []))

            # The digg/view/comment counts are loaded by Ajax, so request the JSON
            # endpoint and hand the half-filled loader over to parse_nums via meta.
            yield Request(url=parse.urljoin(response.url,
                                            "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)),
                          meta={"article_item": item_loader, "url": response.url},
                          callback=self.parse_nums)

    def parse_nums(self, response):
        j_data = json.loads(response.text)
        item_loader = response.meta.get("article_item", "")

        item_loader.add_value("praise_nums", j_data["DiggCount"])
        item_loader.add_value("fav_nums", j_data["TotalView"])
        item_loader.add_value("comment_nums", j_data["CommentCount"])
        item_loader.add_value("url_object_id", common.get_md5(response.meta.get("url", "")))

        article_item = item_loader.load_item()
        yield article_item
2. Zhihu crawler
Breaking through the upside-down Chinese character captcha and the English text captcha.
(PS: for the upside-down characters, refer to the zheye project on GitHub: https://github.com/996refuse/zheye .
For English letters and digits, use grayscale-based recognition or a captcha-solving API; the API route is recommended as it is convenient and highly accurate. A minimal grayscale sketch is shown after this note.)
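For the grayscale route mentioned above, a minimal sketch using Pillow and pytesseract (both assumptions; the notes only name grayscale recognition as an option, and in practice the paid API in the code below was used for better accuracy):

import pytesseract
from PIL import Image


def recognize_english_captcha(path, threshold=140):
    """Roughly read an English/digit captcha: grayscale, binarize, then OCR."""
    image = Image.open(path).convert("L")                        # convert to grayscale
    image = image.point(lambda p: 255 if p > threshold else 0)   # binarize to drop noise
    return pytesseract.image_to_string(image).strip()

# Example: code = recognize_english_captcha("yzm_en.jpeg")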
Code:
import time
import base64
import pickle  # only needed by the cookie-reuse alternative at the bottom

import scrapy
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from mouse import move, click

from ArticleSpider.zheye import zheye
class ZhihuSpider(scrapy.Spider):
    name = "zhihu"
    allowed_domains = ["www.zhihu.com"]
    start_urls = ['https://www.zhihu.com/']

    def parse(self, response):
        pass

    def start_requests(self):
        # Attach Selenium to a Chrome instance started manually with
        # --remote-debugging-port=9222, which sidesteps most Selenium detection.
        chrome_option = Options()
        chrome_option.add_argument("--disable-extensions")
        chrome_option.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
        browser = webdriver.Chrome(executable_path="D:/linkfoxdownload/chromedriver_win32/chromedriver.exe",
                                   chrome_options=chrome_option)
        try:
            browser.maximize_window()
        except:
            pass

        # Fill in the login form and submit (the button is clicked repeatedly
        # to make sure the click registers).
        browser.get("https://www.zhihu.com/signin")
        browser.find_element_by_css_selector(".SignFlow-accountInput.Input-wrapper input").send_keys(Keys.CONTROL + "a")
        browser.find_element_by_css_selector(".SignFlow-accountInput.Input-wrapper input").send_keys("18672536828")
        browser.find_element_by_css_selector(".SignFlow-password input").send_keys(Keys.CONTROL + "a")
        browser.find_element_by_css_selector(".SignFlow-password input").send_keys("focusonme221221")
        browser.find_element_by_css_selector(".Button.SignFlow-submitButton").click()
        browser.find_element_by_css_selector(".Button.SignFlow-submitButton").click()
        browser.find_element_by_css_selector(".Button.SignFlow-submitButton").click()
        time.sleep(10)

        login_success = False
        while not login_success:
            # The header user-info element only exists once we are logged in.
            try:
                browser.find_element_by_class_name("AppHeader-userInfo")
                login_success = True
            except:
                pass

            # Check which kind of captcha (if any) is currently shown.
            try:
                chinese_captcha_element = browser.find_element_by_class_name("Captcha-chineseImg")
            except:
                chinese_captcha_element = None
            try:
                english_captcha_element = browser.find_element_by_class_name("Captcha-englishImg")
            except:
                english_captcha_element = None

            if chinese_captcha_element:
                # Absolute screen position of the captcha image: element position
                # inside the page plus the height of the browser chrome above it.
                ele_position = chinese_captcha_element.location
                x_relative = ele_position["x"]
                y_relative = ele_position["y"]
                browser_navigation_panel_height = browser.execute_script(
                    'return window.outerHeight - window.innerHeight;'
                )

                # The captcha image is embedded as a base64 data URI; decode it to a file.
                base64_text = chinese_captcha_element.get_attribute("src")
                code = base64_text.replace("data:image/jpg;base64,", "").replace("%0A", "")
                with open("yzm_cn.jpeg", "wb") as fh:
                    fh.write(base64.b64decode(code))

                # zheye returns the (row, col) positions of the upside-down characters.
                z = zheye()
                positions = z.Recognize('yzm_cn.jpeg')
                last_position = []
                if len(positions) == 2:
                    # Order the two characters left to right and swap each pair to (x, y).
                    if positions[0][1] > positions[1][1]:
                        last_position.append([positions[1][1], positions[1][0]])
                        last_position.append([positions[0][1], positions[0][0]])
                    else:
                        last_position.append([positions[0][1], positions[0][0]])
                        last_position.append([positions[1][1], positions[1][0]])
                    # zheye analyses a 2x-scaled image, so halve the coordinates.
                    first_position = [int(last_position[0][0] / 2), int(last_position[0][1] / 2)]
                    second_position = [int(last_position[1][0] / 2), int(last_position[1][1] / 2)]

                    move(x_relative + first_position[0],
                         y_relative + browser_navigation_panel_height + first_position[1])
                    click()
                    time.sleep(3)
                    move(x_relative + second_position[0],
                         y_relative + browser_navigation_panel_height + second_position[1])
                    click()
                else:
                    last_position.append([positions[0][1], positions[0][0]])
                    first_position = [int(last_position[0][0] / 2), int(last_position[0][1] / 2)]
                    move(x_relative + first_position[0],
                         y_relative + browser_navigation_panel_height + first_position[1])
                    click()

                # Re-enter the credentials and submit again.
                browser.find_element_by_css_selector(".SignFlow-accountInput.Input-wrapper input").send_keys(Keys.CONTROL + "a")
                browser.find_element_by_css_selector(".SignFlow-accountInput.Input-wrapper input").send_keys("18672536828")
                browser.find_element_by_css_selector(".SignFlow-password input").send_keys(Keys.CONTROL + "a")
                browser.find_element_by_css_selector(".SignFlow-password input").send_keys("focusonme221221")
                time.sleep(3)
                browser.find_element_by_css_selector(".Button.SignFlow-submitButton").click()
                browser.find_element_by_css_selector(".Button.SignFlow-submitButton").click()

            if english_captcha_element:
                ele_position = english_captcha_element.location
                x_relative = ele_position["x"]
                y_relative = ele_position["y"]
                browser_navigation_panel_height = browser.execute_script(
                    'return window.outerHeight - window.innerHeight;'
                )

                # Save the captcha image and send it to the captcha-solving service,
                # retrying until a non-empty answer comes back.
                base64_text = english_captcha_element.get_attribute("src")
                code = base64_text.replace("data:image/jpg;base64,", "").replace("%0A", "")
                with open("yzm_en.jpeg", "wb") as fh:
                    fh.write(base64.b64decode(code))

                from ArticleSpider.tools.chaorendm import dcVerCode
                chaorendm = dcVerCode("1165224769", "ljq123", "3696")
                code = chaorendm.recYZM("yzm_en.jpeg")[0]
                while code == "":
                    code = chaorendm.recYZM("yzm_en.jpeg")[0]
                # (An earlier version used the yundama service via YDMHttp in the same way.)

                # Type the recognised characters into the captcha input, then
                # re-enter the credentials and submit again.
                captcha_input_xpath = '//*[@id="root"]/div/main/div/div/div/div[1]/div/form/div[4]/div/div/label'
                browser.find_element_by_xpath(captcha_input_xpath).send_keys(Keys.CONTROL + "a")
                browser.find_element_by_xpath(captcha_input_xpath).send_keys(code)
                browser.find_element_by_css_selector(".SignFlow-accountInput.Input-wrapper input").send_keys(Keys.CONTROL + "a")
                browser.find_element_by_css_selector(".SignFlow-accountInput.Input-wrapper input").send_keys("18672536828")
                browser.find_element_by_css_selector(".SignFlow-password input").send_keys(Keys.CONTROL + "a")
                browser.find_element_by_css_selector(".SignFlow-password input").send_keys("focusonme221221")
                time.sleep(3)
                browser.find_element_by_css_selector(".Button.SignFlow-submitButton").click()
                browser.find_element_by_css_selector(".Button.SignFlow-submitButton").click()
                time.sleep(60)
    # Alternative start_requests: reuse cookies saved from an earlier Selenium
    # login instead of logging in on every run.
    # def start_requests(self):
    #     cookies = pickle.load(open("C:/Users/dell/ArticleSpider/ArticleSpider/cookies/zhihu.cookie", "rb"))
    #     cookie_dict = {}
    #     for cookie in cookies:
    #         cookie_dict[cookie["name"]] = cookie["value"]
    #     return [scrapy.Request(url=self.start_urls[0], dont_filter=True, cookies=cookie_dict)]
    #
    # After a successful Selenium login the cookies can be dumped with:
    #     cookies = browser.get_cookies()
    #     pickle.dump(cookies, open("C:/Users/dell/ArticleSpider/ArticleSpider/cookies/zhihu.cookie", "wb"))
Common problem: coordinate recognition returns the horizontal and vertical values swapped; a coordinate-inspection tool is recommended for checking the click positions.
Solution code:
from ArticleSpider.zheye import zheye

z = zheye()
# Recognize returns the (row, col) positions of the upside-down characters.
positions = z.Recognize('image/a.gif')

# Swap each (row, col) pair into (x, y) and order the points left to right,
# so they can be used directly as click coordinates.
last_position = []
if len(positions) == 2:
    if positions[0][1] > positions[1][1]:
        last_position.append([positions[1][1], positions[1][0]])
        last_position.append([positions[0][1], positions[0][0]])
    else:
        last_position.append([positions[0][1], positions[0][0]])
        last_position.append([positions[1][1], positions[1][0]])
else:
    last_position.append([positions[0][1], positions[0][0]])
print(last_position)