Part 1: Building Scrapers
Part 2: Advanced Scraping
# Create the virtual environment scrapingEnv
$ sudo virtualenv --no-site-packages -p /usr/bin/python3 scrapingEnv
# Activate and use it
$ cd scrapingEnv/
$ source bin/activate
# Install BeautifulSoup
$ which python
$ sudo [/home/jax/Documents/virtualenvProject/scrapingEnv/bin/python] -m pip install beautifulsoup4
# Leave the environment
$ deactivate
$ sudo pip3 install -i http://mirrors.aliyun.com/pypi/simple beautifulsoup4 --trusted-host mirrors.aliyun.com
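A quick sanity check (not from the book) that the package imports in the active environment:
# Confirm beautifulsoup4 is importable and show its version
import bs4
print(bs4.__version__)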
bs = BeautifulSoup(html.read(), 'lxml')
find and find_all
Or access child tags directly to get a list of objects or a single object: bs.div.h1
.children: a tag's direct children only
.descendants: all of a tag's descendants (children, children's children, and so on)
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen('http://www.pythonscraping.com/pages/page3.html')
bs = BeautifulSoup(html, 'lxml')
for child in bs.find('table', {'id': 'giftList'}).children:
    print(child)
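.children yields only a tag's direct children; .descendants recurses into every nested tag and text node. A short sketch of the difference on the same page (not from the book):
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('http://www.pythonscraping.com/pages/page3.html')
bs = BeautifulSoup(html, 'lxml')
# Unlike .children above, .descendants also walks into each row's cells and text
for descendant in bs.find('table', {'id': 'giftList'}).descendants:
    print(descendant)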
.next_siblings: the sibling tags that come after the calling object
.previous_siblings: the sibling tags that come before it
.next_sibling and .previous_sibling: the single next/previous sibling
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen('http://www.pythonscraping.com/pages/page3.html')
bs = BeautifulSoup(html, 'lxml')
for sibling in bs.find('table', {'id': 'giftList'}).tr.next_siblings:
    print(sibling)
.parent: a tag's direct parent
.parents: all of its ancestor tags
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen('http://www.pythonscraping.com/pages/page3.html')
bs = BeautifulSoup(html, 'lxml')
print(bs.find('img',
              {'src': '../img/gifts/img1.jpg'})
      .parent.previous_sibling.get_text())
.get_text() strips all tags from the HTML document being processed and returns a string containing only the text.
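For example, a minimal sketch (not the book's code) that prints only the text of the gift table:
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('http://www.pythonscraping.com/pages/page3.html')
bs = BeautifulSoup(html, 'lxml')
# .get_text() drops every tag and returns the remaining text as one string
print(bs.find('table', {'id': 'giftList'}).get_text())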
find_all(tag, attributes, recursive, text, limit, keywords)
find(tag, attributes, recursive, text, keywords)
The tag parameter: pass a single tag name, or a list of tag names, e.g.
.find_all(['h1', 'h2', 'h3'])
The attributes parameter: a dictionary mapping a tag's attributes to the values to match, e.g.
.find_all('span', {'class': {'green', 'red'}})
The recursive parameter: a boolean (True by default). If set to False, find_all only looks at the document's top-level tags; if True, it searches all child tags and their children in turn (see the sketch after the regex example below).
The text parameter: match on a tag's text content instead of its attributes, e.g.
.find_all(text='the prince')
The limit parameter: return only the first x results from the page.
Keyword arguments: select only those tags that have the specified attributes, e.g.
title = bs.find_all(id='title', class_='text')
keyword is a redundant BeautifulSoup feature
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
html = urlopen('http://www.pythonscraping.com/pages/page3.html')
bs = BeautifulSoup(html, 'lxml')
images = bs.find_all('img',
                     {'src': re.compile(r'\.\./img/gifts/img.*\.jpg')})
for image in images:
    print(image['src'])
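A short sketch (not from the book) of the recursive, text, and limit parameters against the same page; the text value is an assumption about the page's content:
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('http://www.pythonscraping.com/pages/page3.html')
bs = BeautifulSoup(html, 'lxml')
# recursive=False: only the direct children of the document are searched
top_level = bs.find_all('table', recursive=False)
# text: match a tag's text content instead of its attributes
# ('Vegetable Basket' is assumed to be one of the gift titles on this page)
titles = bs.find_all(text='Vegetable Basket')
# limit: stop after the first two matching rows
first_two_rows = bs.find_all('tr', limit=2)
print(len(top_level), len(titles), len(first_two_rows))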
For a tag object, the following code returns all of its attributes (as a dictionary object):
myTag.attrs
myImgTag.attrs['src']
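A minimal sketch with a made-up in-memory snippet, just to show the dictionary that .attrs returns:
from bs4 import BeautifulSoup

# The markup here is invented for illustration only
bs = BeautifulSoup('<img src="../img/gifts/img1.jpg" alt="gift">', 'html.parser')
myImgTag = bs.img
print(myImgTag.attrs)           # {'src': '../img/gifts/img1.jpg', 'alt': 'gift'}
print(myImgTag.attrs['src'])    # ../img/gifts/img1.jpg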
A lambda expression is essentially a function that can be passed into another function as a variable.
BeautifulSoup allows a particular type of function (one that takes a tag object as its only argument and returns a boolean) to be passed as an argument to find_all():
bs.find_all(lambda tag: len(tag.attrs) == 2)
bs.find_all(lambda tag: tag.get_text() == 'Or maybe he\'s only resting?')
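A runnable sketch of those two filters, again with made-up markup; note that a tag's get_text() includes its descendants' text, so the outer div matches the second filter too:
from bs4 import BeautifulSoup

# Invented markup: the div has one attribute, the span has two
markup = '<div id="outer"><span class="green" id="s1">Or maybe he\'s only resting?</span></div>'
bs = BeautifulSoup(markup, 'html.parser')
# Tags with exactly two attributes -> the span
print(bs.find_all(lambda tag: len(tag.attrs) == 2))
# Tags whose text equals the phrase -> both the div and the span
print(bs.find_all(lambda tag: tag.get_text() == "Or maybe he's only resting?"))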
Use {} and : in place of the traditional % formatting style.
P30:html = urlopen('http://en.wikipedia.org{}'.format(articleUrl))
Use {} to mark the positions to be filled; by default the first index is 0 and subsequent ones increase from there.
print("This is {}, and it's price is {}".format('apple', 2))
print("This is {1}, and it's price is {0}".format(2, 'apple'))
print("This is {1}, and {1}'s price is {0}".format(2, 'apple'))
Using a list: unpack it with *list, or index into it with [n] inside the braces
goods_list = ['apple', 2]
print("This is {0}, and it's price is {1}".format(*goods_list))
print("This is {0[0]}, and it's price is {0[1]}".format(goods_list))
Using a dictionary: unpack it with **dict
goods_dic = {'goods': 'apple', 'price': 2}
print("This is {goods}, and it's price is {price}".format(goods='apple', price=2))
print("This is {goods}, and it's price is {price}".format(**goods_dic))
使用 f"xxxx"
来格式化字符串
goods = 'apple'
price = 2
print(f"This is {goods}, and it's price is {price}")
Use : inside the braces for padding characters, alignment, number bases, and precision
Padding and alignment
text = "test"
print(f"{text:*>10}") // ******test
print(f"{text:*<10}") // test******
print(f"{text:*^10}") // ***test***
Precision and number bases
num1 = 1/3
num2 = 10
num3 = 1342143215432
print(f"{num1:.3f}")  # 0.333
print(f"{num2:b}")    # 1010
print(f"{num2:o}")    # 12
print(f"{num2:x}")    # a
print(f"{num3:,}")    # 1,342,143,215,432
random.seed(datetime.datetime.now())
random.randint(0, len(links) - 1)
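A short sketch (not the book's full program) of how these two calls combine to pick a random link to follow next; the links list below is a stand-in for whatever find_all returned:
import datetime
import random

# Seed with the current time (as a float, which every Python 3 version accepts)
random.seed(datetime.datetime.now().timestamp())
# Stand-in for the hrefs collected with bs.find_all('a', ...)
links = ['/wiki/Web_scraping', '/wiki/Python_(programming_language)', '/wiki/Kevin_Bacon']
nextLink = links[random.randint(0, len(links) - 1)]
print(nextLink)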
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
pages = set()
def getLinks(pageUrl):
    global pages
    html = urlopen('http://en.wikipedia.org{}'.format(pageUrl))
    bs = BeautifulSoup(html, 'html.parser')
    try:
        print(bs.h1.get_text())
        print(bs.find(id='mw-content-text').find_all('p')[0])
        print(bs.find(id='ca-edit').find('span')
              .find('a').attrs['href'])
    except AttributeError:
        print("This page is missing some attributes")
    for link in bs.find_all('a', href=re.compile('^(/wiki/)')):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                newPage = link.attrs['href']
                print("-" * 20)
                print(newPage)
                pages.add(newPage)
                getLinks(newPage)

getLinks('')
Redirects let a web server point a domain name or URL at content hosted in a different location.
r = requests.get('http://github.com', allow_redirects=True)
Fails when run.
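For reference, requests records each hop of a redirect chain in r.history and the final address in r.url; a small sketch (not from the book):
import requests

r = requests.get('http://github.com', allow_redirects=True)
# Intermediate responses (e.g. the 301 from http to https), then the final URL
for resp in r.history:
    print(resp.status_code, resp.url)
print(r.status_code, r.url)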
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
import datetime
import random
pages = set()
allExtLinks = set()
allIntLinks = set()
random.seed(datetime.datetime.now())
# Get a list of all internal links found on the page
def getInternalLinks(bs, includeUrl):
    includeUrl = '{}://{}'.format(urlparse(includeUrl).scheme,
                                  urlparse(includeUrl).netloc)
    internalLinks = []
    # Find all links that begin with a "/"
    for link in bs.find_all('a',
                            href=re.compile('^(/|.*' + includeUrl + ')')):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                if link.attrs['href'].startswith('/'):
                    internalLinks.append(includeUrl + link.attrs['href'])
                else:
                    internalLinks.append(link.attrs['href'])
    return internalLinks
# Get a list of all external links found on the page
def getExternalLinks(bs, excludeUrl):
    externalLinks = []
    # Find all links that start with "http" or "www" and do not contain the current URL
    for link in bs.find_all('a',
                            href=re.compile('^(http|www)((?!' + excludeUrl + ').)*$')):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks
def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    bs = BeautifulSoup(html, 'html.parser')
    externalLinks = getExternalLinks(bs,
                                     urlparse(startingPage).netloc)
    if len(externalLinks) == 0:
        print('No external links, looking around the site for one')
        domain = '{}://{}'.format(urlparse(startingPage).scheme,
                                  urlparse(startingPage).netloc)
        internalLinks = getInternalLinks(bs, domain)
        return getRandomExternalLink(internalLinks[random.randint(0,
                                     len(internalLinks) - 1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks) - 1)]
def followExternalOnly(startingSite):
    externalLink = getRandomExternalLink(startingSite)
    print('Random external link is: {}'.format(externalLink))
    followExternalOnly(externalLink)
def getAllExternalLinks(siteUrl):
    html = urlopen(siteUrl)
    domain = '{}://{}'.format(urlparse(siteUrl).scheme,
                              urlparse(siteUrl).netloc)
    bs = BeautifulSoup(html, 'html.parser')
    internalLinks = getInternalLinks(bs, domain)
    externalLinks = getExternalLinks(bs, domain)
    for link in externalLinks:
        if link not in allExtLinks:
            allExtLinks.add(link)
            print(link)
    for link in internalLinks:
        if link not in allIntLinks:
            allIntLinks.add(link)
            getAllExternalLinks(link)

allIntLinks.add('http://oreilly.com')
getAllExternalLinks('http://oreilly.com')
# Install Scrapy
$ pip3 install -i http://mirrors.aliyun.com/pypi/simple Scrapy --trusted-host mirrors.aliyun.com
# Initialize a project
$ scrapy startproject [projectName]
# Create a spider
$ vim [projectName]/spiders/[spider].py
# Run the spider
$ cd [projectName]/spiders
$ scrapy runspider [spider].py
The start_requests function is the program entry point defined by Scrapy; it generates the Request objects Scrapy uses to crawl the site.
The parse function is a user-defined callback, passed to each Request object via callback=self.parse.
The CrawlSpider class: a Spider subclass that follows links automatically according to a set of Rule objects.
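A minimal spider sketch along those lines; the spider name and target URL are placeholders, not the book's exact example:
import scrapy

class ArticleSpider(scrapy.Spider):
    name = 'article'

    # Entry point defined by Scrapy: yield the Request objects to crawl
    def start_requests(self):
        urls = ['http://example.com/']
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    # User-defined callback, wired up via callback=self.parse above
    def parse(self, response):
        title = response.css('title::text').get()
        print('Page title: {}'.format(title))
Save it under [projectName]/spiders/ and run it with scrapy runspider as shown above.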
Either capture only the file's URL link,
or download the source file itself.
Use urllib.request.urlretrieve to download from a remote URL:
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen('http://www.pythonscraping.com')
bs = BeautifulSoup(html, 'html.parser')
imageLocation = bs.find('a', {'id': 'logo'}).find('img')['src']
urlretrieve(imageLocation, 'log.jpg')
Full code: P72
CSV (comma-separated values)
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen('http://en.wikipedia.org/wiki/'
'Comparison_of_text_editors')
bs = BeautifulSoup(html, 'html.parser')
# The main comparison table is the first table on the page
table = bs.findAll('table', {'class': 'wikitable'})[0]
rows = table.findAll('tr')
csvFile = open('editors.csv', 'wt+')
writer = csv.writer(csvFile)
try:
    for row in rows:
        csvRow = []
        for cell in row.findAll(['td', 'th']):
            csvRow.append(cell.get_text())
        writer.writerow(csvRow)
finally:
    csvFile.close()
Store it however you like; a document-oriented database such as MongoDB is recommended.
# Install MySQL
$ sudo apt-get install mysql-server
# Integrate with Python
$ pip3 install PyMySQL
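A minimal PyMySQL sketch (the host, credentials, database, and table below are placeholders for whatever the local setup uses):
import pymysql

# Placeholder connection details; adjust to the local MySQL installation
conn = pymysql.connect(host='127.0.0.1', user='root',
                       password='password', database='scraping', charset='utf8mb4')
cur = conn.cursor()
try:
    # 'pages' is a hypothetical table with id and title columns
    cur.execute('SELECT title FROM pages WHERE id = %s', (1,))
    print(cur.fetchone())
finally:
    cur.close()
    conn.close()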
An application of computer-networking knowledge
smtplib and email
The email module contains many useful message-formatting functions and is used to build the email "package".
The smtplib module handles the information for the server connection.
import smtplib
from email.mime.text import MIMEText
from bs4 import BeautifulSoup
from urllib.request import urlopen
import time
def sendMail(subject, body):
    msg = MIMEText(body)
    msg['Subject'] = subject
    msg['From'] = '[email protected]'
    msg['To'] = '[email protected]'
    s = smtplib.SMTP('localhost')
    s.send_message(msg)
    s.quit()

bs = BeautifulSoup(urlopen('https://isitchristmas.com/'), 'html.parser')
while bs.find('a', {'id': 'answer'}).attrs['title'] == 'NO':
    print('It is not Christmas yet.')
    time.sleep(3600)
    bs = BeautifulSoup(urlopen('https://isitchristmas.com/'), 'html.parser')
sendMail('It\'s Christmas!',
         'According to https://isitchristmas.com, it is Christmas!')