Web Crawler Notes
The simplest example
'''
from urllib import request

with request.urlopen("http://www.runoob.com") as f:
    if f.status == 200:  # f.status is the HTTP status code (200); f.reason is the reason phrase, e.g. "OK"
        data = f.read()  # read the response body into data; at this point it is raw bytes
        # print(data.decode())
        print(f.getheaders())  # the response headers, returned as a list of (name, value) tuples
        for k, v in f.getheaders():
            print(k, v)
        try:
            with open('first.html', 'w+') as fp:
                fp.write(data.decode())
        except Exception as e:
            print(e)
'''
* Key points covered:
* Three common ways to read content (a short sketch follows this list):
  1. read() reads the entire file; unlike readline(), it returns everything it reads as a single string.
  2. readlines() reads the entire file and returns the content as a list of lines.
  3. readline() reads one line of the file.
* File open modes:
  1. w: write mode. It cannot read; opening an existing file with w truncates its previous content and writes from scratch.
     w+: read and write; any mode containing w truncates the existing file.
  2. r: read mode. Read-only, and the file must already exist.
     r+: read and write; any mode containing r requires the file to exist.
  3. a: append mode. Also writable; content is added at the end of the file.
  4. rb+, wb+, ab+: the same modes opened in binary, for reading or writing binary files such as audio files.
* The urllib library: urllib is the module Python provides for working with URLs, and it comes up constantly when crawling web pages.
  1. urllib.request opens and reads URLs.
  2. urllib.parse contains helpers for parsing URLs.
  3. urllib.error contains the exceptions raised by urllib.request; they can be caught with try.
  4. urllib.robotparser parses robots.txt files. It provides a single RobotFileParser class whose can_fetch() method tests whether a crawler may download a given page (see the second sketch after this list).
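A minimal sketch of the read methods and open modes listed above; the file name notes.txt is only a placeholder.
'''
# A minimal sketch of the read methods and open modes listed above.
# The file name "notes.txt" is only a placeholder.
with open('notes.txt', 'w+') as fp:   # w+ truncates the file (or creates it) and allows reading
    fp.write("line 1\nline 2\nline 3\n")

with open('notes.txt', 'r') as fp:    # r requires the file to exist
    print(fp.read())                  # whole file as one string

with open('notes.txt', 'r') as fp:
    print(fp.readlines())             # whole file as a list of lines

with open('notes.txt', 'r') as fp:
    print(fp.readline())              # just the first line

with open('notes.txt', 'a') as fp:    # a appends at the end without truncating
    fp.write("line 4\n")
'''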
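And a short sketch of urllib.robotparser's can_fetch(); the target site is the runoob.com URL already used in these notes, and the page path is only an illustrative assumption.
'''
# A short robotparser sketch; the page path below is only an illustrative assumption.
from urllib import robotparser

rp = robotparser.RobotFileParser()
rp.set_url("http://www.runoob.com/robots.txt")  # point the parser at the site's robots.txt
rp.read()                                        # download and parse it
# can_fetch(user_agent, url) -> True if this crawler may download the page
print(rp.can_fetch("*", "http://www.runoob.com/python/python-tutorial.html"))
'''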
Improved version of the runoob crawler
'''
from urllib import request, parse
import random

url = "http://www.runoob.com"
query_obj = {"s": "js"}
query_string = parse.urlencode(query_obj)  # turn the dict into a "key=value" query string
url = url + "/?" + query_string
print(url)

# Build a Request object. To send a GET request that looks like it came from a browser,
# we need a Request object; by adding HTTP headers to it we can disguise the request as a browser.
req = request.Request(url)

# Inspect the headers that were set
# print(req.headers['User-agent'])
# print(req.get_header('User-agent'))

# All of the following are browser User-Agent strings
ua_list = [
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52"
]
# Pick a random User-Agent (the browser identification string)
user_agent = random.choice(ua_list)
# Adding the HTTP header to the Request object disguises the request as a browser
req.add_header('User-Agent', user_agent)

with request.urlopen(req) as f:
    if f.status == 200:
        data = f.read()  # raw bytes at this point
        try:
            with open('aaa.html', 'w+', encoding='utf-8') as fp:
                fp.write(data.decode())
        except Exception as e:
            print(e)
    else:
        print(1)
'''
How to crawl data from a site served over HTTPS
'''
from urllib import request, parse
import random
import ssl

# Skip SSL certificate verification for HTTPS requests
ssl._create_default_https_context = ssl._create_unverified_context

# A pool of identities (User-Agent strings) to pick from at random
ua_list = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52"
]

# The URL to crawl
url_0 = "https://s.taobao.com/search?q=%E6%89%8B%E6%9C%BA&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.1&" \
        "ie=utf8&initiative_id=tbindexz_20170306&cps=yes&ppath=2176%3A39862256&p4ppushleft=5%2C48&"

def loadData(start, end):
    for i in range(start, end + 1):
        taobaoSpider(i)

def taobaoSpider(i):
    qs = {
        "s": i * 48
    }
    url = url_0 + parse.urlencode(qs)  # turn the dict into a "key=value" query string
    req = request.Request(url)
    user_agent = random.choice(ua_list)
    req.add_header("User-Agent", user_agent)
    with request.urlopen(req) as f:
        if f.status == 200:
            data = f.read().decode()
            filename = "taobaoPage" + str(i) + ".html"
            saveFile(filename, data)

def saveFile(filename, data):
    try:
        with open(filename, 'w+', encoding='utf-8') as fp:
            fp.write(data)
    except Exception as e:
        print(e)

if __name__ == '__main__':
    start = 0
    end = 6
    loadData(start, end)
'''
Baidu - headed (visible browser)
'''
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import time

browser = webdriver.Chrome()
try:
    browser.get("https://www.baidu.com")
    input = browser.find_element_by_id("kw")
    input.send_keys("Python")
    input.send_keys(Keys.ENTER)
    wait = WebDriverWait(browser, 10)
    wait.until(EC.presence_of_element_located((By.ID, "content_left")))
    print(browser.current_url)
    print(browser.get_cookies())
    print(browser.page_source)
    time.sleep(10)
finally:
    browser.close()
'''
Baidu - headless
'''
from selenium.webdriver.chrome.options import Options
...
try:
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    browser = webdriver.Chrome(chrome_options=chrome_options)
'''
* Page waits
* An implicit wait tells the driver to keep polling for an element for up to a fixed amount of time; an explicit wait blocks until a specific condition becomes true before execution continues (see the explicit-wait sketch after the following block).
'''
import time
from selenium.webdriver.chrome.options import Options

try:
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    browser = webdriver.Chrome(chrome_options=chrome_options)
    # Implicit wait
    browser.implicitly_wait(10)  # seconds
    url = 'https://s.taobao.com/search?q=&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20180914&ie=utf8'
    browser.get(url)
    input = browser.find_element_by_id("q")
    input.send_keys("nike")
    input.send_keys(Keys.ENTER)
    wait = WebDriverWait(browser, 10)
    # wait.until(EC.presence_of_element_located((By.ID, "content_left")))
    print(browser.current_url)
    print(browser.get_cookies())
    # print(browser.page_source)
    with open('taobao.html', 'w+') as fp:
        fp.write(browser.page_source)
    time.sleep(10)
finally:
    browser.close()
'''
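The block above relies only on the implicit wait and leaves the explicit wait commented out. Below is a minimal explicit-wait sketch, reusing the WebDriverWait, EC, and By imports from the headed Baidu example; the element ID "mainsrp-itemlist" is only an assumption used for illustration.
'''
# A minimal explicit-wait sketch. It reuses the WebDriverWait/EC/By imports from the
# headed Baidu example; the element ID "mainsrp-itemlist" is only an assumption.
wait = WebDriverWait(browser, 10)  # poll for at most 10 seconds
try:
    # Block until the element is present in the DOM, then continue
    result_list = wait.until(EC.presence_of_element_located((By.ID, "mainsrp-itemlist")))
    print(result_list.tag_name)
except Exception as e:
    # The condition never became true within 10 seconds
    print(e)
'''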
Example: simulating a Douban login
'''
import time
from selenium.webdriver.chrome.options import Options

try:
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    browser = webdriver.Chrome(chrome_options=chrome_options)
    browser.get("http://www.douban.com")
    # Fill in the account and password
    browser.find_element_by_name("form_email").send_keys("13812790420")
    browser.find_element_by_name("form_password").send_keys("******")
    # Simulate clicking the login button
    browser.find_element_by_xpath("//input[@class='bn-submit']").click()
    # Wait 3 seconds
    time.sleep(3)
    # Take a screenshot after logging in
    browser.save_screenshot("douban.png")
finally:
    browser.close()
'''
Connecting to a MongoDB database
'''
from pymongo import MongoClient

uri = 'mongodb://root:[email protected]:27017/jumei'
try:
    # Connect to the MongoDB server
    client = MongoClient(uri)
    # Select the database we need; jumei is the database name
    db = client.jumei
    # Select the collection (what we usually call a table); goods is the collection name
    collection = db.goods
    # From here on, collection can be used to operate on the data (a short write sketch follows this block)
    # Find every document in the collection
    for item in collection.find():
        print(item)
except Exception as e:
    print(e)
'''
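The block above only reads. Below is a minimal sketch of writing to the same collection, assuming the client/db/collection objects from above; the document fields are made up purely for illustration.
'''
# A minimal write sketch, assuming the same client/db/collection objects as above.
# The document fields below are made up purely for illustration.
doc = {"name": "sample product", "price": 9.9}
result = collection.insert_one(doc)                      # insert a single document
print(result.inserted_id)                                # MongoDB assigns an _id automatically
print(collection.count_documents({}))                    # count every document in the collection
print(collection.find_one({"name": "sample product"}))   # read one document back
'''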
Scrapy