This article is for learning and exchange purposes only.
Before getting into web crawlers, I wrapped up a summary of my recent study of VMware networking.
Now, on to the main topic!
# Block all crawlers from the entire site
User-agent: *
Disallow: /

# Allow all crawlers to access everything
User-agent: *
Disallow:

# Block all crawlers from specific directories
User-agent: *
Disallow: /private/
Disallow: /tmp/

# Allow only WebCrawler; block every other crawler
User-agent: WebCrawler
Disallow:
User-agent: *
Disallow: /
from urllib.robotparser import RobotFileParser

rp = RobotFileParser('http://www.jianshu.com/robots.txt')
rp.read()
# Use can_fetch() to check whether a given page may be crawled
print(rp.can_fetch('*', 'http://www.jianshu.com/p/b67554025d7d'))
print(rp.can_fetch('*', 'https://www.jianshu.com/p/c678ce575f4c'))
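Besides can_fetch(), RobotFileParser also provides crawl_delay() and request_rate() (Python 3.6+), which read the Crawl-delay and Request-rate directives if the site declares them. A minimal sketch reusing the rp object parsed above; whether jianshu.com actually declares these directives is not something I checked:

# None is returned when the site does not declare these directives
print(rp.crawl_delay('*'))
print(rp.request_rate('*'))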
Crawling the Bilibili homepage
import requests

def get_one_page(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    }
    # Pass the headers so the request carries a browser-like User-Agent
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.text
    return None

def main():
    url = 'https://www.bilibili.com'
    html = get_one_page(url)
    print(html)

main()
Crawling Zhihu Explore and saving the results to a text file
import requests
from pyquery import PyQuery as pq

url = 'https://www.zhihu.com/explore'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
html = requests.get(url, headers=headers).text
doc = pq(html)
items = doc('.explore-tab .feed-item').items()
for item in items:
    question = item.find('h2').text()
    author = item.find('.author-link-line').text()
    answer = pq(item.find('.content').html()).text()
    # Append each record to a text file, separated by a divider line
    with open('explore.txt', 'a', encoding='utf-8') as file:
        file.write('\n'.join([question, author, answer]))
        file.write('\n' + '=' * 50 + '\n')
import json

data = [{
    'name': '张三',
    'gender': '女',
    'birthday': '1992-10-18'
}]
with open('data.json', 'w', encoding='utf-8') as file:
    file.write(json.dumps(data, indent=2, ensure_ascii=False))
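To load the data back, json.loads() turns the stored text into Python objects again. A small sketch assuming the data.json file written above:

import json

with open('data.json', encoding='utf-8') as file:
    data = json.loads(file.read())
    print(data[0]['name'])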
import csv

# newline='' prevents blank lines between rows on Windows
with open('data.csv', 'w', newline='') as csvfile:
    fieldnames = ['id', 'name', 'age']
    writer = csv.DictWriter(csvfile, delimiter=' ', fieldnames=fieldnames)
    writer.writeheader()
    writer.writerow({'id': '10001', 'name': 'Mike', 'age': 20})
    writer.writerow({'id': '10002', 'name': 'Bob', 'age': 22})
    writer.writerow({'id': '10003', 'name': 'Jordan', 'age': 21})
import pandas

# The file above is space-delimited, so tell pandas which separator to use
df = pandas.read_csv('data.csv', sep=' ')
print(df)
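The standard library can read it back as well: csv.DictReader yields one dict per row. A minimal sketch for the same space-delimited data.csv:

import csv

with open('data.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile, delimiter=' ')
    for row in reader:
        print(row['id'], row['name'], row['age'])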
import pymysql

db = pymysql.connect(host='localhost', user='root', password='[数据库密码]', port=3306, db='spiders')
cursor = db.cursor()

# Create the table
sql = 'CREATE TABLE IF NOT EXISTS students (id VARCHAR(255) NOT NULL, name VARCHAR(255) NOT NULL, age INT NOT NULL, PRIMARY KEY (id))'
cursor.execute(sql)

# Insert a row; commit on success, roll back on failure
try:
    id = '20120001'
    user = 'Bob'
    age = 20
    sql = 'INSERT INTO students(id, name, age) VALUES (%s, %s, %s)'
    cursor.execute(sql, (id, user, age))
    db.commit()
except:
    db.rollback()

db.close()
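Queries go through the same cursor: execute a SELECT and iterate over the result set with fetchall() or fetchone(). A minimal sketch against the students table created above, assuming the connection is still open at this point:

# Query rows back out of the students table
sql = 'SELECT * FROM students WHERE age >= %s'
try:
    cursor.execute(sql, (20,))
    for row in cursor.fetchall():
        print(row)
except Exception as e:
    print('Error:', e)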
from pyquery import PyQuery as pq
from urllib.parse import urlencode
import requests
base_url = "https://m.weibo.cn/api/container/getIndex?"
headers = {
    'Host': 'm.weibo.cn',
    'Referer': 'https://m.weibo.cn/u/2830678474',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}
def get_page(page):
    params = {
        'type': 'uid',
        'value': '2830678474',
        'containerid': '1076032830678474',
        'page': page
    }
    url = base_url + urlencode(params)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.json()
    except requests.ConnectionError as e:
        print('Error', e.args)
def parse_page(json):
    if json:
        items = json.get('data').get('cards')
        for item in items:
            item = item.get('mblog')
            # Some cards are not weibo posts and carry no mblog field
            if not item:
                continue
            weibo = {}
            weibo['id'] = item.get('id')
            weibo['text'] = pq(item.get('text')).text()
            weibo['attitudes'] = item.get('attitudes_count')
            weibo['comments'] = item.get('comments_count')
            weibo['reposts'] = item.get('reposts_count')
            yield weibo
if __name__ == '__main__':
    for page in range(1, 11):
        json = get_page(page)
        results = parse_page(json)
        for result in results:
            print(result)
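To keep the crawled posts instead of just printing them, the json module used earlier can append each record as one line of a text file (a simple JSON Lines layout). A minimal sketch of my own; the weibo.json filename is arbitrary:

import json

def save_weibo(result):
    # Append one JSON object per line so the file can grow page by page
    with open('weibo.json', 'a', encoding='utf-8') as f:
        f.write(json.dumps(result, ensure_ascii=False) + '\n')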
from hashlib import md5
from multiprocessing.pool import Pool
from urllib.parse import urlencode
import os
import requests
def get_page(offset):
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.json()
    except requests.ConnectionError:
        return None
def get_images(json):
    if json and json.get('data'):
        for item in json.get('data'):
            title = item.get('title')
            images = item.get('image_detail')
            # Skip entries that carry no image list
            if not images:
                continue
            for image in images:
                yield {
                    'image': image.get('url'),
                    'title': title
                }
def save_image(item):
    if not os.path.exists(item.get('title')):
        os.mkdir(item.get('title'))
    try:
        response = requests.get(item.get('image'))
        if response.status_code == 200:
            # Name the file by the MD5 of its content to avoid duplicates
            file_path = '{0}/{1}.{2}'.format(item.get('title'), md5(response.content).hexdigest(), 'jpg')
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(response.content)
            else:
                print('Already Downloaded', file_path)
    except requests.ConnectionError:
        print('Failed to Save Image')
def main(offset):
    json = get_page(offset)
    for item in get_images(json):
        print(item)
        save_image(item)

GROUP_START = 1
GROUP_END = 20

if __name__ == '__main__':
    pool = Pool()
    groups = ([x * 20 for x in range(GROUP_START, GROUP_END + 1)])
    pool.map(main, groups)
    pool.close()
    pool.join()
However, I did not manage to get this one to run successfully.
tesserocr download
import tesserocr
from PIL import Image
image = Image.open('Code.jpg')
result = tesserocr.image_to_text(image)
print(result)
import tesserocr
print(tesserocr.file_to_text('Code.jpg'))
In addition, preprocessing such as grayscale conversion and binarization can be applied!
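A minimal sketch of that preprocessing with Pillow, assuming the same Code.jpg captcha: convert('L') produces a grayscale image, and point() with a threshold (80 here is just an illustrative value) binarizes it before handing it to the OCR step.

import tesserocr
from PIL import Image

image = Image.open('Code.jpg')
# Convert to grayscale
image = image.convert('L')
# Binarize: pixels below the threshold become black, the rest white
threshold = 80
image = image.point(lambda x: 0 if x < threshold else 255)
print(tesserocr.image_to_text(image))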
Using Selenium
Configuring ChromeDriver
Sample code:
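A minimal sketch of basic Selenium usage with ChromeDriver; the Baidu search box and its 'kw' id are used only as an illustrative target, not something from the original notes.

from selenium import webdriver
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')
    # Find the search box, type a keyword, and submit the form
    search_box = browser.find_element(By.ID, 'kw')
    search_box.send_keys('Python')
    search_box.submit()
    print(browser.current_url)
    print(browser.get_cookies())
finally:
    browser.close()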
Comparison of urllib and urllib3
# from urllib.error import URLError
# from urllib.request import ProxyHandler, build_opener
# proxy = '127.0.0.1:9743'
# proxy_handler = ProxyHandler({
# 'http': 'http://' + proxy,
# 'https': 'https://' + proxy
# })
# opener = build_opener(proxy_handler)
# try:
# response = opener.open('http://httpbin.org/get')
# print(response.read().decode('utf-8'))
# except URLError as e:
# print(e.reason)
import urllib3

# Route requests through a local HTTP proxy with urllib3's ProxyManager
proxy = urllib3.ProxyManager('http://127.0.0.1:11088', headers={'connection': 'keep-alive'})
resp = proxy.request('GET', 'http://httpbin.org/ip')
print(resp.status)
print(resp.data.decode('utf-8'))
ADSL dial-up proxies (TinyProxy)
Using Charles
Using mitmproxy
Scraping "得到" (Dedao) App e-book information with mitmdump
Basic usage of Appium
Scraping WeChat Moments with Appium
Scraping JD product details with Appium + mitmdump
For these topics, see the official documentation.
More parameters of scrapy.Request

scrapy.Request(url[, callback, method="GET", headers, body, cookies, meta, dont_filter=False])

callback: the function that will handle the response for this URL.
meta: passes data between different parse functions; meta carries some data by default, such as the download delay and the request depth.
dont_filter: defaults to False, which means requested URLs are filtered, i.e. a URL that has already been requested will not be requested again. Set it to True for URLs that need to be requested repeatedly, for example Tieba pagination pages whose data keeps changing; the URLs in start_urls are requested with it set to True, otherwise the spider would not start. A combined example is sketched after this list.
method: specifies a GET or POST request.
headers: accepts a dict; cookies are not placed here.
cookies: accepts a dict that holds the cookies specifically.
body: accepts a JSON string as the POST data, used when sending a payload POST request.
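As an illustration of these parameters used together, here is a hedged sketch of a payload POST request that should not be de-duplicated; the endpoint URL, payload fields, cookie value, and callback name are all placeholders of my own.

import json
import scrapy

class SearchSpider(scrapy.Spider):
    name = 'search'

    def start_requests(self):
        # Hypothetical endpoint and payload, just to illustrate the parameters
        yield scrapy.Request(
            url='https://example.com/api/search',
            method='POST',
            headers={'Content-Type': 'application/json'},
            body=json.dumps({'keyword': 'python', 'page': 1}),
            cookies={'sessionid': 'xxx'},
            meta={'page': 1},
            callback=self.parse_search,
            dont_filter=True,
        )

    def parse_search(self, response):
        print(json.loads(response.text))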
Using the meta parameter
What meta does: meta lets you pass data between different parse functions. In the spider's parse method, when extracting the detail page, yield a request whose callback points to the parse_detail function:
def parse(self, response):
    ...
    yield scrapy.Request(detail_url, callback=self.parse_detail, meta={"item": item})
    ...

def parse_detail(self, response):
    # Retrieve the item passed in earlier
    item = response.meta["item"]
Special note: the meta parameter is a dict, and the meta dict has one fixed key, proxy, which specifies the proxy IP for the request.
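A brief sketch of that proxy key inside a spider callback; the proxy address is a placeholder:

# Route this single request through a proxy via the fixed meta key
yield scrapy.Request(url, callback=self.parse, meta={'proxy': 'http://127.0.0.1:8888'})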
JSON data

This article is still being updated…