# BeautifulSoup can also be used to modify the document tree
# Sample document for the BeautifulSoup demos below. The HTML tags had been
# stripped from this literal (the demos would crash on soup.a / soup.p.b);
# reconstructed here so every find_all example below has something to match.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<b>$37</b>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>
"""
from bs4 import BeautifulSoup
import re
# First argument is the markup, second selects the parser (lxml is forgiving)
soup = BeautifulSoup(html_doc, 'lxml')
# Alternatively, parse straight from a file:
# soup = BeautifulSoup(open('a.html', 'r', encoding='utf-8'), 'lxml')
# prettify() re-serialises the tree, auto-closing any unclosed tags
html_doc = soup.prettify()
# Grab the first matching tag anywhere in the document (first <a> here)
print(soup.a.text)
# A tag's attributes behave like a dict; index it to get a value
print(soup.a.attrs['href'])
# Nested selection
print(soup.p.b.text)
# Direct children of the first <p>, as a list
print(soup.p.contents)
# .children is an iterator over the same direct children
print(list(soup.p.children))
# .descendants walks all children recursively
print(list(soup.p.descendants))
# Parent tag of the first <a>, and its full ancestor chain
print(soup.a.parent)
print(list(soup.a.parents))
# Searching the tree -- the five kinds of filters:
# 1. Plain strings
print(soup.find_all(name='a'))
print(soup.find_all(attrs={"class": "sister"}))
print(soup.find_all(text="The Dormouse's story"))
print(soup.find_all(name='b', text="The Dormouse's story"))
print(soup.p.find(name='b').text)
print(soup.find(name='p', attrs={'class': 'story'}).find_all(name='a')[1])
# 2. Regular expressions (raw strings: '\$' is an invalid escape sequence
#    and raises SyntaxWarning on Python 3.12+)
print(soup.find_all(name=re.compile('^b')))
print(soup.find_all(attrs={'id': re.compile('link')}))
print(soup.find_all(text=re.compile(r'\$')))
# 3. Lists (matching any one entry is enough)
print(soup.find_all(name=['a', re.compile('^b')]))
print(soup.find_all(text=[re.compile(r'\$'), ]))
# 4. True (matches anything that has the feature at all)
print(soup.find_all(name=True))
print(soup.find_all(attrs={"id": True}))
print(soup.find_all(name='p', attrs={"id": True}))
# 5. Functions
def has_class_not_id(tag):
    """Filter function: match <a> tags that carry a class but no id attribute."""
    if tag.name != 'a':
        return False
    return tag.has_attr("class") and not tag.has_attr('id')
print(soup.find_all(name=has_class_not_id,))
# limit caps how many matches are returned
print(soup.find_all(name='a',limit=5))
# recursive=True (the default) searches all descendants, not just direct children
print(soup.body.find_all(attrs={"class":'sister'},recursive=True))
# print(soup.find_all(name=,attrs=,limit=,text=,recursive=))
print(soup.find_all(attrs={'class':'sister'}))
# class_ is the keyword-argument shortcut for the reserved word 'class'
print(soup.find_all(class_='sister'))
print(soup.find_all(id="link3"))
from gevent import joinall, spawn, monkey
# Patch blocking stdlib calls so gevent can switch greenlets during I/O.
# patch_all() runs before requests/threading are imported, as gevent requires.
monkey.patch_all()
import requests
# current_thread identifies which thread (here: patched greenlet) is running
from threading import current_thread
import time
def parse_page(res):
    """Callback: report which thread parsed a response of len(res) characters."""
    # Thread.name replaces getName(), which is deprecated since Python 3.10
    print('%s PARSE %s' % (current_thread().name, len(res)))
def get_page(url, callback=parse_page):
    """Download *url* and hand the body to *callback* on HTTP 200."""
    print('%s get %s' % (current_thread().getName(), url))
    response = requests.get(url)
    # Only successful responses are parsed; anything else is silently dropped
    if response.status_code != 200:
        return
    callback(response.text)
if __name__ == '__main__':
    urls = [
        'https://www.baidu.com',
        'https://www.taobao.com',
        'https://www.openstack.org',
    ]
    # spawn() submits every download asynchronously; joinall waits for all of them
    tasks = [spawn(get_page, target) for target in urls]
    joinall(tasks)
# aiohttp wraps the methods provided by asyncio
# import asyncio
#
# @asyncio.coroutine
# def task(task_id, senconds):
# print('%s run' %task_id)
# yield from asyncio.sleep(senconds)
# print('%s done' %task_id)
#
# if __name__ == '__main__':
# tasks = [
# task('任务1', 3),
# task('任务2', 2),
# task('任务3', 1),
# ]
# loop = asyncio.get_event_loop() # 循环取出(与下面的语句连用)
# loop.run_until_complete(asyncio.wait(tasks))
# loop.close()
import requests
import asyncio
import uuid
# Desktop Chrome User-Agent string sent with each hand-rolled HTTP request below
User_Agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
def parse_page(res):
    """Persist a downloaded page body (bytes) to a uniquely named .html file."""
    target = '%s.html' % uuid.uuid1()
    with open(target, 'wb') as out:
        out.write(res)
# asyncio needs host, port and path supplied separately (no URL parsing here)
async def get_page(host, port=80, url='/', ssl=False, callback=parse_page):
    """Fetch http(s)://host:port/url over a raw asyncio stream and hand the
    response body to *callback*.

    Rewritten from the legacy ``@asyncio.coroutine`` / ``yield from`` style,
    which was removed in Python 3.11, to async/await. Also reads the whole
    response body with read() -- the original called readline() once and
    captured only the body's first line.
    """
    # 1. Open the connection (TLS always means port 443, overriding the caller)
    if ssl:
        port = 443
    print('下载:https:%s:%s:%s' % (host, port, url))
    recv, send = await asyncio.open_connection(host=host, port=port, ssl=ssl)
    # 2. Build a minimal HTTP/1.0 request (1.0 => server closes when done,
    #    so read() below terminates at EOF)
    request_headers = """GET %s HTTP/1.0\r\nHost: %s\r\nUser-Agent: %s\r\n\r\n""" % (url, host, User_Agent)
    request_headers = request_headers.encode('utf-8')
    # 3. Send the request
    send.write(request_headers)
    await send.drain()
    # 4. Consume response headers up to the blank separator line
    while True:
        line = await recv.readline()
        if line == b'\r\n':
            break
        print('%s响应头: %s' % (host, line))
    # 5. Read the entire response body until EOF
    text = await recv.read()
    print(text)
    # 6. Hand off to the parsing callback
    callback(text)
    # 7. Close the writer
    send.close()
if __name__ == '__main__':
    tasks = [
        get_page(host='www.baidu.com', url='/s?wd=美女', ssl=True),
        get_page(host='www.cnblogs.com', url='/linhaifeng/articles/7806303.html', ssl=True)
    ]
    loop = asyncio.get_event_loop()
    # gather() accepts bare coroutine objects; passing them to asyncio.wait()
    # was deprecated in 3.8 and raises TypeError since Python 3.11
    loop.run_until_complete(asyncio.gather(*tasks))
    loop.close()
种类 | 用法 | 属性 |
---|---|---|
uuid1 | uuid.uuid1(node, clock_seq) | 1.基于时间戳,主机id,序列号和当前时间生成uuid(保证全球唯一) 2.有两个参数,node未指定:自动调用(getnode()函数来获取主机的(mac)地址) clock_seq 参数未指定:系统会使用一个随机产生的14位序列号来代替 |
uuid3 | uuid.uuid3(namespace, name) | 通过计算机名字和命名空间的md5散列值得到,保证了同一命名空间中不同名字的唯一性,(和不同命名空间的唯一性)但是:同一命名空间的同一名字生成相同的id |
uuid4 | uuid.uuid4() : 基于随机数 | 通过随机数来生成uuid,使用的是伪随机数,有一定的重复概率 |
uuid5 | uuid.uuid5(namespace, name) | 通过计算机命名空间的名字的SHA-1散列值来生成uuid,算法与uuid.uuid3()相同 |
总结: