Web Scraping Summary

A collection of crawler examples, organized as working notes.

The simplest example

'''
	from urllib import request

	with request.urlopen("http://www.runoob.com") as f:
		if f.status==200: # f.status is the HTTP status code; f.reason holds the reason phrase, e.g. "OK"
			data=f.read() # read the whole response body; at this point data is raw bytes
			# print(data.decode())
			print(f.getheaders()) # the response headers, as a list of (name, value) tuples
			for k,v in f.getheaders():
				print(k,v)
			try:
				# the with block closes the file automatically, so no explicit close is needed
				with open('first.html','w+') as fp:
					fp.write(data.decode())
			except Exception as e:
				print(e)
'''

* Key points (a short sketch follows this list):

	* There are three common ways to read the content:
		1. read() reads the entire content; unlike readline(), read() returns everything as a single string (bytes in the case of an HTTP response).
		2. readlines() reads the entire content and returns it as a list of lines.
		3. readline() reads a single line.

	* File read/write modes
		1. w is write-only mode; it cannot read. Opening an existing file with w truncates its previous contents and starts over.
		    w+ is read/write; any mode containing w will truncate the existing file.

		2. r is read-only mode; it cannot write, and the file must already exist.
		    r+ is read/write; any mode containing r requires the file to exist.

		3. a is append mode; it also writes, adding content at the end of the file.

		4. rb+, wb+, ab+ open the file in binary mode for reading/writing, e.g. for audio or other binary files.
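
A minimal sketch of these read functions and open modes (the file name modes_demo.txt is only an illustrative placeholder):

'''
	# 'w' truncates (or creates) the file and opens it for writing only
	with open('modes_demo.txt','w') as fp:
		fp.write('line 1\nline 2\nline 3\n')

	# 'r' opens an existing file for reading only
	with open('modes_demo.txt','r') as fp:
		print(fp.readline())   # one line: 'line 1\n'
		print(fp.readlines())  # the rest, as a list: ['line 2\n', 'line 3\n']

	# 'a' appends at the end without touching the existing content
	with open('modes_demo.txt','a') as fp:
		fp.write('line 4\n')

	# 'rb' reads raw bytes, the mode to use for audio and other binary files
	with open('modes_demo.txt','rb') as fp:
		print(fp.read())  # the whole file as a bytes object
'''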
		

* The urllib library: urllib is the module Python provides for working with URLs, and it comes up constantly when crawling web pages.

	1. urllib.request is used to open and read URLs;

	2. urllib.parse contains functions for parsing and building URLs;

	3. urllib.error contains the exceptions raised by urllib.request, which can be caught with try/except;

	4. urllib.robotparser parses robots.txt files. It provides a single class, RobotFileParser, whose can_fetch() method tests whether a crawler is allowed to download a given page (see the sketch below).
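
A minimal sketch that ties urllib.parse, urllib.robotparser and urllib.error together (the target site is just an example):

'''
	from urllib import error, parse, request, robotparser

	# urllib.parse: build a query string from key/value pairs
	url="https://www.python.org/search/?"+parse.urlencode({"q":"urllib"})

	# urllib.robotparser: check robots.txt before fetching
	rp=robotparser.RobotFileParser("https://www.python.org/robots.txt")
	rp.read()
	print(rp.can_fetch("*",url))  # True if this URL may be crawled

	# urllib.error: catch the exceptions raised by urllib.request
	try:
		with request.urlopen(url) as f:
			print(f.status)
	except error.HTTPError as e:
		print("HTTP error:",e.code)
	except error.URLError as e:
		print("failed to reach the server:",e.reason)
'''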

An improved version of the runoob crawler

'''
	from urllib import request,parse
	import random

	url="http://www.runoob.com"
	query_obj={"s":"js"}
	query_string=parse.urlencode(query_obj)
	url=url+"/?"+query_string
	print(url)
	# build the Request object
	req=request.Request(url)  # to send a GET request that looks like it came from a browser, we need a Request object; adding HTTP headers to it lets us disguise the request as a browser
	# inspect the request headers
	# print(req.headers['User-agent'])
	# print(req.get_header('User-agent'))
	# a pool of browser User-Agent strings
	ua_list=[
	    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
	    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
	    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
	    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
	    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
	    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
	    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
	    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
	    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
	    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
	    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
	    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
	    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
	    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
	    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
	    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52"
	]
	# pick a random User-Agent (the string that identifies the browser)
	user_agent=random.choice(ua_list)
	# adding this HTTP header to the Request object disguises the request as a browser
	req.add_header('User-Agent',user_agent)
	with request.urlopen(req) as f:
		if f.status==200:
			data=f.read() # at this point data is raw bytes
			try:
				with open('aaa.html','w+',encoding='utf-8') as fp: # the with block closes the file automatically
					fp.write(data.decode())
			except Exception as e:
				print(e)
		else:
			print("request failed with status",f.status)
	
'''

How to crawl data from an HTTPS site

'''
	from urllib import request,parse
	import random
	import ssl
	
	# skip SSL certificate verification for HTTPS requests
	ssl._create_default_https_context=ssl._create_unverified_context
	# pick a random identity (User-Agent) from this list
	ua_list=[
		"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52"
	]
	# the URL to crawl
	url_0="https://s.taobao.com/search?q=%E6%89%8B%E6%9C%BA&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.1&"\
  "ie=utf8&initiative_id=tbindexz_20170306&cps=yes&ppath=2176%3A39862256&p4ppushleft=5%2C48&"


	def loadData(start,end):
		for i in range(start,end+1,1):
			taobaoSpider(i)


	def taobaoSpider(i):
		qs={
			"s":i*48
		}
		url=url_0
		url=url+parse.urlencode(qs) # turn the key/value pairs into a "key=value" query string
		req=request.Request(url)
		user_agent=random.choice(ua_list)
		req.add_header("User-Agent",user_agent)
		with request.urlopen(req) as f:
			if f.status == 200:
				data=f.read().decode()
				filename="taobaoPage"+str(i)+".html"
				saveFile(filename,data)

	def saveFile(filename,data):
		try:
			with open(filename,'w+',encoding='utf-8') as fp: # the with block closes the file automatically
				fp.write(data)
		except Exception as e:
			print(e)

	if __name__=='__main__':
		start=0
		end=6
		loadData(start,end)

'''

Baidu: headed (visible browser)

'''
	from selenium import webdriver
	from selenium.webdriver.common.by import By
	from selenium.webdriver.common.keys import Keys
	from selenium.webdriver.support import expected_conditions as EC
	from selenium.webdriver.support.wait import WebDriverWait
	import time
	browser=webdriver.Chrome()
	try:
		browser.get("https://www.baidu.com")
		input=browser.find_element_by_id("kw")
		input.send_keys("Python")
		input.send_keys(Keys.ENTER)
		wait=WebDriverWait(browser,10)
		wait.until(EC.presence_of_element_located((By.ID,"content_left")))  # wait until the results container appears
		print(browser.current_url)
		print(browser.get_cookies())
		print(browser.page_source)
		time.sleep(10)
	finally:
		browser.close()
	
'''

Baidu: headless

'''
	from selenium.webdriver.chrome.options import Options
	...
	try:
	    chrome_options = Options()
	    chrome_options.add_argument('--headless')
	    chrome_options.add_argument('--disable-gpu')
	    browser = webdriver.Chrome(chrome_options=chrome_options)
	
'''
* Page waits
	* An implicit wait tells the driver to keep polling for up to a fixed amount of time whenever it looks up an element; an explicit wait pauses execution until a specified condition becomes true (or a timeout expires). See the sketch below.
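
A minimal sketch of an explicit wait (the element ID kw is Baidu's search box from the earlier example); the longer Taobao block that follows relies on the implicit wait instead:

'''
	from selenium import webdriver
	from selenium.webdriver.common.by import By
	from selenium.webdriver.support import expected_conditions as EC
	from selenium.webdriver.support.wait import WebDriverWait

	browser=webdriver.Chrome()
	try:
		browser.get("https://www.baidu.com")
		# explicit wait: block for at most 10 seconds until the search box is present
		search_box=WebDriverWait(browser,10).until(
			EC.presence_of_element_located((By.ID,"kw"))
		)
		search_box.send_keys("Python")
	finally:
		browser.close()
'''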

'''
	import time

	from selenium import webdriver
	from selenium.webdriver.common.keys import Keys
	from selenium.webdriver.support.wait import WebDriverWait
	from selenium.webdriver.chrome.options import Options
	try:
	    chrome_options = Options()
	    chrome_options.add_argument('--headless')
	    chrome_options.add_argument('--disable-gpu')
	    browser = webdriver.Chrome(chrome_options=chrome_options)
	    # implicit wait: element lookups poll for up to 10 seconds before failing
	    browser.implicitly_wait(10)  # seconds
	    url='https://s.taobao.com/search?q=&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20180914&ie=utf8'
	    browser.get(url)
	    input=browser.find_element_by_id("q")
	    input.send_keys("nike")
	    input.send_keys(Keys.ENTER)
	    wait=WebDriverWait(browser,10)
	    # wait.until(EC.presence_of_element_located((By.ID,"content_left")))
	    print(browser.current_url)
	    print(browser.get_cookies())
	    # print(browser.page_source)
	    with open('taobao.html','w+',encoding='utf-8') as fp:  # the with block closes the file automatically
	        fp.write(browser.page_source)
	    time.sleep(10)
	finally:
	    browser.close()
'''

Example: simulating a login to Douban

'''
	import time

	from selenium import webdriver
	from selenium.webdriver.chrome.options import Options

	try:
	    chrome_options = Options()
	    chrome_options.add_argument('--headless')
	    chrome_options.add_argument('--disable-gpu')
	    browser = webdriver.Chrome(chrome_options=chrome_options)
	    browser.get("http://www.douban.com")

	    # enter the account and password
	    browser.find_element_by_name("form_email").send_keys("13812790420")
	    browser.find_element_by_name("form_password").send_keys("******")

	    # simulate clicking the login button
	    browser.find_element_by_xpath("//input[@class='bn-submit']").click()

	    # wait 3 seconds for the login to complete
	    time.sleep(3)

	    # take a screenshot of the page after logging in
	    browser.save_screenshot("douban.png")
	finally:
	    browser.close()		
'''

Connecting to a MongoDB database

'''
	from pymongo import MongoClient

	uri='mongodb://root:[email protected]:27017/jumei'
	try:
		# open a connection to the MongoDB server
		client=MongoClient(uri)
		# select the database; jumei is the database name
		db=client.jumei
		# select the collection (what we usually call a table); goods is the collection name
		collection=db.goods
		# collection can now be used to operate on the data, e.g. fetch every document:
		for item in collection.find():
			print(item)
	except Exception as e:
		print(e)

'''
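
A minimal sketch of a few other common collection operations, assuming the same collection handle from above (the document fields are only illustrative):

'''
	# insert a single document
	collection.insert_one({"name": "lipstick", "price": 59.0})

	# find one document matching a filter
	print(collection.find_one({"name": "lipstick"}))

	# update the first matching document
	collection.update_one({"name": "lipstick"}, {"$set": {"price": 49.0}})

	# count documents matching a filter, then delete them
	print(collection.count_documents({"name": "lipstick"}))
	collection.delete_many({"name": "lipstick"})
'''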

Scrapy
