冰冠 2018-06-13 15:58:11
OS: Kali Linux; IDE: PyCharm 2018.1.1 Professional; Python: Anaconda3 (Python 3.6)
# identify the technologies a site was built with
import builtwith
print(builtwith.parse("http://example.webscraping.com/"))

# look up the registration details of a domain
import whois
print(whois.whois('baidu.com'))
6. Downloading web pages that contain the data of interest is called crawling.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" @author [email protected]
    @function:
    @create 18-6-12 10:28 AM"""
import urllib.request, urllib.error


def download(url):
    """Download a web page; return None if the download fails."""
    print("Downloading:", url)
    try:
        html = urllib.request.urlopen(url).read()
    except urllib.error.URLError as e:
        print("Download error:", e.reason)
        html = None
    return html


print(download("http://www.baidu.com"))
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" @author [email protected]
    @function: crawl a website (retry on server errors)
    @create 18-6-13 4:20 PM"""
import urllib.request, urllib.error


def download(url, num_retries=2):
    """Download a page, retrying up to num_retries times on 5xx server errors."""
    print("Downloading:", url)
    try:
        html = urllib.request.urlopen(url).read()
    except urllib.error.URLError as e:
        print("Download error:", e.reason)
        html = None
        # retry only for server errors (HTTP 5xx)
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            return download(url, num_retries - 1)
    return html


download("http://httpstat.us/500")
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" @author [email protected]
    @function: crawl a website (set a custom User-agent)
    @create 18-6-13 4:20 PM"""
import urllib.request, urllib.error


def download(url, user_agent='wswp', num_retries=2):
    """Download a page with a custom User-agent, retrying on 5xx server errors."""
    print("Downloading:", url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        html = urllib.request.urlopen(request).read()
    except urllib.error.URLError as e:
        print("Download error:", e.reason)
        html = None
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            return download(url, user_agent, num_retries - 1)
    return html
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" @author [email protected]
    @function: crawl a website via its sitemap
    @create 18-6-13 4:20 PM"""
import urllib.request, urllib.error, re


def download(url, user_agent='wswp', num_retries=2):
    print("Downloading:", url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        html = urllib.request.urlopen(request).read()
    except urllib.error.URLError as e:
        print("Download error:", e.reason)
        html = None
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            return download(url, user_agent, num_retries - 1)
    return html


# download("http://httpstat.us/500")


def crawl_sitemap(url):
    """Download the sitemap, extract the <loc> links and download each page."""
    sitemap = download(url)
    links = re.findall('<loc>(.*?)</loc>', str(sitemap))
    for link in links:
        html = download(link)


crawl_sitemap('http://example.webscraping.com/sitemap.xml')
Sometimes a site's URLs also contain a page alias, which helps with search engine optimization. The web server usually ignores this string and only uses the ID to look up the matching record in the database, so a URL such as /places/default/view/Afghanistan-1 and its alias-free form /places/default/view/-1 should load the same record.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" @author [email protected]
    @function: first steps with an urllib crawler (iterate over database IDs)
    @create 18-6-13 4:20 PM"""
import urllib.request, urllib.error, re, itertools


def download(url, user_agent='wswp', num_retries=2):
    print("Downloading:", url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        html = urllib.request.urlopen(request).read()
    except urllib.error.URLError as e:
        print("Download error:", e.reason)
        html = None
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            return download(url, user_agent, num_retries - 1)
    return html


# download("http://httpstat.us/500")


def id_crawl():
    """Crawl result pages by iterating over the numeric ID and skipping the alias."""
    max_errors = 5
    num_errors = 0
    for page in itertools.count(1):
        url = 'http://example.webscraping.com/places/default/view/-%d' % page
        html = download(url)
        if html is None:
            # count the failed download and give up once max_errors is reached
            num_errors += 1
            if num_errors == max_errors:
                break


id_crawl()
6.5 Link crawler
For some sites the crawler needs to behave more like a regular user and follow links to reach the content of interest. Following links makes it easy to download every page of a site, but it also downloads many pages we do not need, so the links have to be filtered further, for example with regular expressions.
import urllib.parse


# Link crawler
def get_links(html):
    """Return the list of href values found in a page."""
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    print("Searching, wait")
    return webpage_regex.findall(str(html))


def link_crawler(seed_url, link_regex):
    """Starting from seed_url, follow every link that matches link_regex."""
    crawl_queue = [seed_url]
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        for link in get_links(html):
            if re.match(link_regex, link):
                # the links in the page are relative, so make them absolute
                link = urllib.parse.urljoin(seed_url, link)
                crawl_queue.append(link)


link_crawler('http://example.webscraping.com', '(.*?)/(index|view)/')
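Note that link_crawler above can queue the same page over and over, because the index and view pages link back to each other. A minimal sketch of one way around this, reusing the download() and get_links() helpers above (the name link_crawler_dedup is just a placeholder), is to remember which URLs have already been queued:

def link_crawler_dedup(seed_url, link_regex):
    """Like link_crawler, but every URL is downloaded at most once."""
    crawl_queue = [seed_url]
    seen = set(crawl_queue)  # URLs that have already been queued
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        for link in get_links(html):
            if re.match(link_regex, link):
                link = urllib.parse.urljoin(seed_url, link)
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)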
6.6 Other features
Parsing robots.txt, throttling the download speed, avoiding crawler traps, and so on.
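None of these are implemented in the notes above. A minimal sketch of the first two, using the standard library's urllib.robotparser and the same 'wswp' user agent as before (the can_fetch helper and the Throttle class are just illustrative names), could look like this:

import time
import urllib.parse
import urllib.robotparser


def can_fetch(robots_url, url, user_agent='wswp'):
    """Check robots.txt before downloading a URL."""
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    return rp.can_fetch(user_agent, url)


class Throttle:
    """Wait so that requests to the same domain are at least `delay` seconds apart."""
    def __init__(self, delay):
        self.delay = delay
        self.last_accessed = {}  # domain -> time of the last request

    def wait(self, url):
        domain = urllib.parse.urlparse(url).netloc
        last = self.last_accessed.get(domain)
        if self.delay > 0 and last is not None:
            sleep_secs = self.delay - (time.time() - last)
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.last_accessed[domain] = time.time()


# Usage: call throttle.wait(url) before each download, and skip URLs
# that can_fetch() rejects.
throttle = Throttle(delay=1)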
Appendix: exercise source code
https://github.com/ice1995/python_web_crawler-/tree/master/day01_crawl