爬虫技术:爬取今日头条数据-崔庆才思路
一. urllib库中将字典转化为url的查询参数
二.请求异常的处理,以及内部的判断逻辑
1.返回的json数据为空:原因是requests的请求对象没有加请求头和cookies
import requests
from urllib.parse import urlencode


def get_page_index():
    """Query the Toutiao search API and print the raw response body.

    No request headers or cookies are sent here on purpose: this snippet
    reproduces the case in which the API answers with an empty payload.
    """
    params = {
        "aid": "24",
        "app_name": "web_search",
        "offset": "0",
        "format": "json",
        "keyword": "街拍",
        "autoload": "true",
        "count": "20",
        "en_qc": "1",
        "cur_tab": "1",
        "from": "search_tab",
        "pd": "synthesis",
        "timestamp": "1568883030289",
    }
    # urlencode() turns the dict into the "key=value&key=value" query string.
    url = "https://www.toutiao.com/api/search/content/?" + urlencode(params)
    response = requests.get(url)
    if response.status_code == 200:
        print(response.text)


if __name__ == '__main__':
    get_page_index()
# 结果:
{"count":0,"return_count":0,"query_id":"6537385837821170952","has_more":0,"request_id":"20190919170154010017090029827CF0A","search_id":"20190919170154010017090029827CF0A","cur_ts":1568883714,"offset":20,"message":"success","pd":"synthesis","show_tabs":1,"keyword":"街拍","city":"西安","log_pb":{"impr_id":"20190919170154010017090029827CF0A"},"data":null,"data_head":[{"challenge_code":1366,"cell_type":71,"keyword":"街拍","url":"sslocal://search?keyword=%E8%A1%97%E6%8B%8D\u0026from=\u0026source=search_tab"}],"ab_fields":null,"latency":0,"search_type":2,"tab_rank":null}
2.正常获得数据
import requests
from urllib.parse import urlencode

# Request headers copied from a real browser session.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36",
}

# Raw value of the browser's "Cookie" request header.
COOKIE_STRING = "tt_webid=6719272225969096196; WEATHER_CITY=%E5%8C%97%E4%BA%AC; tt_webid=6719272225969096196; csrftoken=b28e41c77cd4f268af393de7d3e9d47a; UM_distinctid=16c4159a9ae7e3-04be696c185f6c-3f385c06-1fa400-16c4159a9afa94; CNZZDATA1259612802=1303724616-1564459685-https%253A%252F%252Fwww.toutiao.com%252F%7C1564459685; WIN_WH=1536_710; s_v_web_id=e588fb5c6570d79a16b67e84decce3d8; __tasessionId=y99fyeyyt1568882979794"

# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10


def _parse_cookie_header(raw):
    """Split a raw "name=value; name2=value2" Cookie header into a dict.

    requests expects ``cookies`` to map each cookie name to its value, so the
    header string has to be split instead of being passed as one entry.
    Fragments without "=" are ignored.
    """
    pairs = (item.split("=", 1) for item in raw.split(";") if "=" in item)
    return {name.strip(): value.strip() for name, value in pairs}


COOKIES = _parse_cookie_header(COOKIE_STRING)


def get_page_index():
    """Query the Toutiao search API with browser headers and cookies.

    Prints the UTF-8 decoded response body when the server answers 200.
    """
    params = {
        "aid": "24",
        "app_name": "web_search",
        "offset": "0",
        "format": "json",
        "keyword": "街拍",
        "autoload": "true",
        "count": "20",
        "en_qc": "1",
        "cur_tab": "1",
        "from": "search_tab",
        "pd": "synthesis",
        "timestamp": "1568883030289",
    }
    url = "https://www.toutiao.com/api/search/content/?" + urlencode(params)
    response = requests.get(
        url, headers=HEADERS, cookies=COOKIES, timeout=REQUEST_TIMEOUT
    )
    if response.status_code == 200:
        print(response.content.decode("utf-8"))


if __name__ == '__main__':
    get_page_index()

# Result (truncated):
# {"count":20,"return_count":20,"query_id":"6537385837821170952","has_more":1,"request_id":"20190919170856010017031149086E0FC","search_id":"20190919170856010017031149086E0FC","cur_ts":1568884136,"offset":20,"message":"success","pd":"synthesis","show_tabs":1,"keyword":"街拍","city":"西安","tokens":["街拍"],"log_pb":{"impr_id":"20190919170856010017031149086E0FC"},"data":[{"ala_src":"user","app_info":{"query_type":"AladdinRpcQueryType"},"cell_type ... (rest omitted)
四:
图片地址位置定位:要先请求这个网址,获得响应后解析出对应的image_url
解析报错:SyntaxError: Non-UTF-8 code starting with '\xe5',在程序上方添加 # -*- coding:utf-8 -*-
json中的键值对,期望用双引号而不是单引号。原因是正则错误:
五:完整的代码
# -*- coding:utf-8 -*-
"""Crawl Toutiao search results, download gallery images, store metadata in MongoDB."""
import json
import os
import re
from hashlib import md5
from urllib.parse import urlencode

import pymongo
import requests
from bs4 import BeautifulSoup
from pymongo.errors import PyMongoError
from requests.exceptions import RequestException

# The script reads MONGO_URL, MONGO_DB, MONGO_TABLE, START_PAGE and END_PAGE from config.
from config import *

# Connection object for the MongoDB server.
client = pymongo.MongoClient(MONGO_URL)
# Handle to the target database.
db = client[MONGO_DB]

# Placeholders: fill in the User-Agent and the raw Cookie header of a browser session.
HEADERS = {"User-Agent": "xx"}
COOKIE_STRING = "xx"

# Seconds to wait for a response before the request is treated as failed.
REQUEST_TIMEOUT = 10

# Captures the JavaScript string literal passed to JSON.parse() on a gallery page.
# Key point 6: re.S lets "." match newlines as well.
GALLERY_PATTERN = re.compile(r'gallery: JSON\.parse\("(.*?)"\),', re.S)


def _parse_cookie_header(raw):
    """Split a raw "name=value; name2=value2" Cookie header into a dict.

    requests expects ``cookies`` to map each cookie name to its value.
    Fragments without "=" (such as the "xx" placeholder) are ignored.
    """
    pairs = (item.split("=", 1) for item in raw.split(";") if "=" in item)
    return {name.strip(): value.strip() for name, value in pairs}


COOKIES = _parse_cookie_header(COOKIE_STRING)


def _fetch(url):
    """Return the response for *url*, or None on a request error or non-200 status."""
    try:
        response = requests.get(
            url, headers=HEADERS, cookies=COOKIES, timeout=REQUEST_TIMEOUT
        )
    except RequestException:
        # Key point 2 / 4: RequestException is the base class of every
        # exception requests raises, so one handler covers them all.
        print("请求出错")
        return None
    if response.status_code != 200:
        return None
    return response


def get_page_index(offset, keyword):
    """Fetch one page of search results and return the body as text.

    Args:
        offset: result offset sent to the API.
        keyword: search keyword.

    Returns:
        The decoded response body, or None if the request failed or the
        status code was not 200.
    """
    params = {
        "aid": "24",
        "app_name": "web_search",
        "offset": offset,
        "format": "json",
        "keyword": keyword,
        "autoload": "true",
        "count": "20",
        "en_qc": "1",
        "cur_tab": "1",
        "from": "search_tab",
        "pd": "synthesis",
        "timestamp": "1568883030289",
    }
    # Key point 1: urlencode() turns {"a": "1", "b": "2"} into "a=1&b=2".
    url = "https://www.toutiao.com/api/search/content/?" + urlencode(params)
    response = _fetch(url)
    if response is None:
        return None
    return response.content.decode()


def parse_page_index(html):
    """Yield the article URLs found in a search-result JSON document.

    Yields nothing when *html* is not valid JSON or when its "data" field
    is missing or null (the API returns null when it has no results).
    """
    try:
        data = json.loads(html)
    except json.JSONDecodeError:
        print("解析出错")
        return
    items = data.get("data") if isinstance(data, dict) else None
    for item in items or []:
        # Key point 3: dict.get() returns None instead of raising for a missing key.
        article_url = item.get("article_url")
        if article_url:
            yield article_url


def get_page_detial(url):
    """Fetch an article page and return its body as text, or None on failure."""
    response = _fetch(url)
    if response is None:
        return None
    return response.content.decode()


def _decode_gallery(raw):
    """Decode the string literal captured from ``JSON.parse("...")``.

    Returns the parsed dict, or None when the text cannot be decoded.
    """
    try:
        # The first pass undoes the JavaScript string escaping (\" \\ \u002F),
        # the second parses the JSON text that JSON.parse() would receive.
        # Stripping backslashes by hand corrupts values that contain
        # escaped quotes and makes json.loads() fail.
        data = json.loads(json.loads('"' + raw + '"'))
    except (json.JSONDecodeError, TypeError):
        print("解析出错")
        return None
    return data if isinstance(data, dict) else None


def parse_page_detial(html, url):
    """Extract the title and gallery image URLs from an article page.

    Every image found is downloaded as a side effect.

    Returns:
        A dict with "title", "images" and "url", or None when the page has
        no gallery or the gallery data cannot be parsed.
    """
    soup = BeautifulSoup(html, "lxml")
    # Key point 5: CSS selectors via soup.select().
    titles = soup.select("title")
    title = titles[0].get_text() if titles else ""

    match = GALLERY_PATTERN.search(html)
    if not match:
        return None
    data = _decode_gallery(match.group(1))
    if not data or "sub_images" not in data:
        return None

    images = [item.get("url") for item in data["sub_images"]]
    for img in images:
        download(img)
    return {
        "title": title,
        "images": images,
        "url": url,
    }


def save_to_mongo(ret_dict):
    """Insert *ret_dict* into the configured collection; return True on success."""
    # Key point 8: connection settings come from the config module.
    try:
        # insert_one() replaces the deprecated Collection.insert().
        result = db[MONGO_TABLE].insert_one(ret_dict)
    except PyMongoError:
        return False
    if result.acknowledged:
        print("插入数据到数据库成功", ret_dict["title"])
        return True
    return False


def download(url):
    """Download the image at *url* and save it to the working directory."""
    print("正在下载图片", url)
    response = _fetch(url)
    if response is not None:
        saveimg(response.content)


def saveimg(content):
    """Write image bytes to "<cwd>/<md5 of content>.jpg" unless it already exists."""
    # Key point 9: the MD5 of the content is the file name, so identical
    # images map to the same path and are stored only once.
    file_path = "{0}/{1}.{2}".format(os.getcwd(), md5(content).hexdigest(), "jpg")
    # Key point 10: os.path.exists() checks for an existing file.
    if not os.path.exists(file_path):
        with open(file_path, "wb") as f:
            f.write(content)


def main():
    """Walk the configured result pages and store every gallery found."""
    keyword = "街拍"
    for offset in range(START_PAGE, END_PAGE, 20):
        index_html = get_page_index(offset, keyword)
        if not index_html:
            continue
        for url in parse_page_index(index_html):
            detail_html = get_page_detial(url)
            if not detail_html:
                continue
            ret = parse_page_detial(detail_html, url)
            if ret:
                save_to_mongo(ret)


if __name__ == '__main__':
    main()
试运行爬取所有的街拍:报错json.decoder.JSONDecodeError,因此代码还得进行优化,排除异常。
六:知识点总结
urlencode是urllib.parse中的一个函数:将字典转换成url的查询参数
from urllib.parse import urlencode

data = {"a": 1, "b": 2}
# The scheme must be followed by "//" for the result to be a valid URL.
url = "http://www.baidu.com/?"
print(url + urlencode(data))
# Output:
# http://www.baidu.com/?a=1&b=2
md5加密的不一致问题
一直以来都是用 hashlib中的md5进行加密,md5.update(二进制) md5.hexdigest(),可能会出现对相同的字符串进行加密,加密结果不一样的问题,看来是update方法造成的。
from hashlib import md5

# One hash object is shared by every iteration; this is the pitfall being
# demonstrated, so equal inputs print different digests.
fp = md5()
demo = ["1", "1", "3", "3"]
for text in demo:
    fp.update(text.encode("utf-8"))
    print(fp.hexdigest())
# Output:
# c4ca4238a0b923820dcc509a6f75849b
# 6512bd43d9caa6e02c990b0a82652dca
# 73278a4a86960eeb576a8fd4c9ec6997
# fd06b8ea02fe5b1c2496fe1700e9d16c
# 原因是md5.update()会将上次的串和这次的进行拼接,1,11,113,1133,每次加密的串都不同,结果肯定不同。
所以每次加密之前,都要对md5重新实例化,才能保证相同内容的加密结果一样。以前这个方法都是放在函数里面的,每次调用函数都会重新实例化md5,因此不存在问题;而在循环中复用同一个md5对象就会出现问题。
上面代码可以改为
from hashlib import md5

demo = ["1", "1", "3", "3"]
for i in demo:
    # A new hash object per item, so no state carries over between iterations.
    fp = md5()
    fp.update(i.encode("utf-8"))
    print(fp.hexdigest())
# 结果为:
c4ca4238a0b923820dcc509a6f75849b
c4ca4238a0b923820dcc509a6f75849b
eccbc87e4b5ce2fe28308fd9f2a7baf3
eccbc87e4b5ce2fe28308fd9f2a7baf3
# This form also works: md5(...) builds a new hash object on every iteration.
for i in demo:
    print(md5(i.encode("utf-8")).hexdigest())
# Output:
# c4ca4238a0b923820dcc509a6f75849b
# c4ca4238a0b923820dcc509a6f75849b
# eccbc87e4b5ce2fe28308fd9f2a7baf3
# eccbc87e4b5ce2fe28308fd9f2a7baf3
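# 看源码也没有理解update真正意图,只是说用字符串更新对象。 后续解决。补充:根据hashlib官方文档,多次调用update()等价于把各次传入的数据拼接后一次性传入,即 m.update(a); m.update(b) 等价于 m.update(a+b);这样设计是为了可以对大文件等数据分块计算哈希。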
os模块的使用方法
# Basic usage of the os module.
import os

# 1. os.getcwd(): return the current working directory.
current_path = os.getcwd()
print(current_path)
# Output:
# C:\Users\Administrator\AppData\Roaming\Sublime Text 3\Packages\User

# 2. os.listdir(path): list the entries of a directory; the result is a list.
dir_list = os.listdir(current_path)
print(dir_list)
# Output:
# ['11.py', 'cuiqingcai.py', 'Localization.sublime-settings', 'oscrypto-ca-bundle.crt', 'Package Control.cache', 'Package Control.last-run', 'Package Control.merged-ca-bundle', 'Package Control.sublime-settings', 'Package Control.user-ca-bundle', 'Preferences.sublime-settings', 'reids分布式锁', 'sha1.py', 'test.py', 'untitled.sublime-build']
具体用法见:https://www.cnblogs.com/yufeihlf/p/6179547.html
Mongo数据库与python的交互
import pymongo  # driver module used to talk to MongoDB
from pymongo import MongoClient

# Placeholder connection settings; replace them with real values.
host = "localhost"
port = 27017
db_name = "test"  # plays the role of a database name
collection_name = "new"  # plays the role of a table name

# Step 1: create a client connected to the MongoDB server (host and port).
client = MongoClient(host, port)
collection = client[db_name][collection_name]

# Step 2: insert a document.
ret = collection.insert_one({"name": "test10010", "age": 33})
print(ret)

# Use the returned result to decide what to do next. The result object itself
# is always truthy, so test its `acknowledged` attribute instead.
if ret.acknowledged:
    ...  # handle the successful insert here
示例:
import pymongo

client = pymongo.MongoClient("localhost")
# Select the given collection in the given database; it is created if it does not exist.
collection = client["test"]["new"]
# insert_one() replaces Collection.insert(), which is deprecated and no longer
# available in PyMongo 4.
ret = collection.insert_one({"new": "python"})
# inserted_id is the ObjectId of the stored document.
print(ret.inserted_id)
# 结果:
5d85ce978a808f42364b045c
插入前:
插入后:
正则表达式知识点回顾:
import re

# re.compile() returns a compiled pattern object; use it together with
# search, findall, match and similar functions.
pattern = re.compile("匹配规则", re.S)

a = """aaaaaaabbbbbbbb
111111ccccc"""

# Without re.S, "." does not match the newline inside `a`, so nothing is found.
pattern1 = re.compile("aaaaaaa(.*?)cccc")
print(re.search(pattern1, a))  # None

# With re.S, "." also matches newlines, so the match can span several lines.
pattern2 = re.compile("aaaaaaa(.*?)cccc", re.S)
print(re.search(pattern2, a))  # <re.Match object; span=(0, 26), match='aaaaaaabbbbbbbb\n111111cccc'>
posted on
2019-09-19 17:18 张京墨 阅读(
...) 评论(
...) 编辑 收藏