一、Regex-based data parsing
import requests
from re import findall
import csv
from threading import Thread
from queue import Queue
# Zhihu
def get_data():
    # Fetch the page
    headers = {
        'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36',
        'Cookie': 'q_c1=e9c0fc9936bb43a9843e7c42cb3e6606|1598837492000|1598837492000; _zap=9c31d477-d8f7-4b4c-9569-10ce524b747f; _xsrf=1AFWm2YY8bPlhvAUno70CULPv74aWf2n; d_c0="AECXX3ic0BGPThwqzgDL2X5bYsQqEf8R-L4=|1598837437"; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1598837438; _ga=GA1.2.1020087299.1598837439; _gid=GA1.2.1083655796.1598837439; capsion_ticket="2|1:0|10:1598837455|14:capsion_ticket|44:YjBmOGViNDhhNjc0NGQwNGFiNjIwNzMxYWRmMzNkNjc=|954713d02354a83bb5b9b68aba03960433094b253c9d81562d402a23690405c4"; z_c0="2|1:0|10:1598837490|4:z_c0|92:Mi4xaW5CWUdRQUFBQUFBUUpkZmVKelFFU1lBQUFCZ0FsVk44cUE1WUFBeGVaMFh5YVNwajRyN3FFaDF3Xy1rQ0kzanRn|f02fb78cc4fde9903ca8074b7752f0b59f2196b8a458f19f6f17c4f1e2badf4e"; unlock_ticket="ADAc3rNA2xAmAAAAYAJVTfpZTF8eeZGyZPAsyJWjNIukQ5J4yqoyVw=="; tst=r; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1598837520; SESSIONID=e9uJa4vdbp1NourUuiQYWFSbTkyDm9oSA96SosNYcG5; JOID=UlsRBU20Xboa6HYaUbL452WQ7M9E0mPNUI9BazfQLoxWhwVrA6IL9EPtch9TXUeToE63P2LXlYrWjgBYgnUkIC8=; osd=V1sXAU+xXbwe6nMaV7b64mWW6M1B0mXJUopBbTPSK4xQgwduA6QP9kbtdBtRWEeVpEyyP2TTl4_WiARah3UiJC0=; KLBRSID=53650870f91603bc3193342a80cf198c|1598837706|1598837436'
    }
    response = requests.get('https://www.zhihu.com/', headers=headers)
    # Parse the data
    print(response.text)
    # Get all titles. The tag pattern here is an assumption - the concrete
    # markup was lost from the original notes; adjust it to the actual page source.
    re_str = r'<h2 class="HotItem-title">(.+?)</h2>'
    print(findall(re_str, response.text))
    # Get all upvote counts (the closing-tag pattern is likewise assumed)
    re_str = r'赞同(.+?)</button>'
    print(findall(re_str, response.text))
    # Get all comment counts from the JSON embedded in the page
    re_str = r'"comment_count":(.+?),'
    print(findall(re_str, response.text))
def get_jiepai():
    # Toutiao "street snap" search; the API returns JSON
    headers = {
        'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
    }
    url = 'https://www.toutiao.com/api/search/content/?aid=24&app_name=web_search&offset=0&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&en_qc=1&cur_tab=1&from=search_tab&pd=synthesis&timestamp=1598840365034&_signature=EeE9wAAgEBAhM65JM0VVJhHgfNAAE6yx9JqQafmWS3C-vrdduSBTwaXD7nAun8UsS25xGLJiQHARyYdKsUB73PH0NcbAS308Kedzm9KELmE9UFYfaDuEHt3aov7a-CdwyIJ'
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    print(response.text)
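# A minimal follow-up sketch: since the endpoint returns JSON, response.json()
# is more robust than regex. The 'data' list and 'title' key below are
# assumptions about this endpoint's payload; inspect the real response first.
def parse_jiepai(response):
    for item in response.json().get('data', []):
        # Not every entry is an article, so guard against missing keys
        if isinstance(item, dict) and 'title' in item:
            print(item['title'])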
# Fetch Douban Top250 data with multiple threads
def get_url_data(url, q: Queue):
    headers = {
        'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    # Capture (title, rating) pairs. The tags around the capture groups were
    # lost in the original; the class names below assume Douban's list markup,
    # and [^&] skips the '&nbsp;/ alias' title entries.
    re_str = r'(?s)<span class="title">([^&].*?)</span>.+?<span class="rating_num" property="v:average">(.+?)</span>'
    q.put(findall(re_str, response.text))
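# A quick offline illustration (sample markup assumed): with two capture
# groups, findall returns a list of (title, rating) tuples, which is why the
# sorting code below uses item[1] as the key.
sample = '<span class="title">The Shawshank Redemption</span> <span class="rating_num" property="v:average">9.7</span>'
print(findall(r'(?s)<span class="title">([^&].*?)</span>.+?<span class="rating_num" property="v:average">(.+?)</span>', sample))
# -> [('The Shawshank Redemption', '9.7')]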
def douban_thread():
    movies_q = Queue()
    start = -25
    all_thread = []
    for _ in range(10):
        start += 25
        url = 'https://movie.douban.com/top250?start=' + str(start)
        t = Thread(target=get_url_data, args=(url, movies_q))
        t.start()
        all_thread.append(t)
    for t in all_thread:
        t.join()
    # Collect the results and write them out
    all_movies = []
    for _ in range(len(all_thread)):
        all_movies += movies_q.get()
    all_movies.sort(reverse=True, key=lambda item: float(item[1]))
    # enumerate assigns ranks in one pass; list.index would be O(n) per item
    # and would give duplicate entries the same rank
    new_all_movies = [(rank,) + item for rank, item in enumerate(all_movies, 1)]
    with open('files/豆瓣电影Top250.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['排名', '电影名称', '评分'])
        writer.writerows(new_all_movies)
    # print(all_movies)
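# An equivalent sketch using the standard library's thread pool instead of
# hand-managed Thread objects (same get_url_data helper and queue; this swaps
# in concurrent.futures rather than reproducing the original approach):
def douban_pool():
    from concurrent.futures import ThreadPoolExecutor
    urls = ['https://movie.douban.com/top250?start=' + str(n * 25) for n in range(10)]
    q = Queue()
    with ThreadPoolExecutor(max_workers=10) as pool:
        for u in urls:
            pool.submit(get_url_data, u, q)
    # Leaving the with-block waits for every task, so the queue now holds
    # one result list per page
    all_movies = []
    for _ in urls:
        all_movies += q.get()
    return all_movies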
# Fetch Douban Top250 data with a single thread
def douban():
    all_movies = []
    start = -25
    for _ in range(10):
        start += 25
        url = 'https://movie.douban.com/top250?start=' + str(start)
        headers = {
            'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
        }
        # Fetch the page
        response = requests.get(url, headers=headers)
        # print(response.text)
        # Parse the data (same reconstructed pattern as get_url_data above;
        # the class names assume Douban's list markup)
        re_str = r'(?s)<span class="title">([^&].*?)</span>.+?<span class="rating_num" property="v:average">(.+?)</span>'
        all_movies += findall(re_str, response.text)
    # Add rank information (pages arrive in order here, so list order is rank order)
    new_all_movies = [(rank,) + item for rank, item in enumerate(all_movies, 1)]
    # Save the data
    with open('files/豆瓣电影.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['排名', '电影名称', '评分'])
        writer.writerows(new_all_movies)
if __name__ == '__main__':
    douban_thread()
二、Using bs4
from bs4 import BeautifulSoup
html = """
The Dormouse's story
我是段落1我是font1我是font2
hello world!
The Dormouse's story1
我是span中的font
The Dormouse's story2
Once upon a time there were three little sisters; and their names were
,
Lacie and
Tillie;
and they lived at the bottom of a well.
...
"""
# 1. Create a parser object for the page
# BeautifulSoup(document to parse, parser type - lxml)
soup = BeautifulSoup(html, 'lxml')
print(type(soup))  # <class 'bs4.BeautifulSoup'>
# Missing or unclosed tags are completed automatically
print(soup)
# Get the completed document as a string
print(type(soup.prettify()))  # <class 'str'>
print(soup.prettify())
# 2. Tag selectors
# parser object.tag name - the first tag with that name in the document
print('======================Tag selectors================')
# Get the first title tag in the page
print(soup.title)
# Get the first p tag in the page
print(soup.p)
# Get the first font tag inside the first p tag
print(soup.p.font)
# 3. Get tag names, tag attributes and tag content
print('====================Tag names, attributes and content====================')
# 1) Tag name: tag object.name
print(soup.title.name)
# 2) Tag attributes:
# tag object.attrs - a dict of all the tag's attributes and their values
# tag object.attrs[attribute name]
print(soup.a.attrs)
print(soup.a.attrs['href'], soup.a.attrs['class'])
# 3) Tag content
# tag object.string - the tag's text (if the only content is a child tag, returns the child's text; if text and child tags coexist, returns None)
# tag object.get_text() - all text inside the tag, including the text of child tags
# tag object.contents - the tag's content as a list whose elements are strings and child tags
print(soup.span.string)  # 我是span中的font
print(soup.p.string)  # None
print(soup.span.get_text())  # 我是span中的font
print(soup.p.get_text())  # 我是段落1我是font1我是font2
print(soup.span.contents)  # [<font>我是span中的font</font>]
print(soup.p.contents)  # ['我是段落1', <font>我是font1</font>, <font>我是font2</font>]
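# get_text() also accepts a separator and a strip flag, which keeps child-tag
# text from running together:
print(soup.p.get_text('|', strip=True))  # 我是段落1|我是font1|我是font2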
# 4. Child and descendant nodes
# tag object.children
# tag object.descendants
print('==========================Child and descendant nodes=================')
# Direct child tags
# print(list(soup.div.children))
print([item for item in soup.div.children if not isinstance(item, str)])
# Descendant tags
# print(list(soup.div.descendants))
print([item for item in soup.div.descendants if not isinstance(item, str)])
# Note: isinstance(data, type) - returns True if the data is of the given type
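# The same filtering reads more idiomatically with bs4's Tag type (the str
# check above works because NavigableString is a str subclass):
from bs4 import Tag
print([item for item in soup.div.children if isinstance(item, Tag)])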
# 5. Get parent and ancestor nodes
# tag object.parent
# tag object.parents
print('==========================Parent and ancestor nodes=========================')
input_tag = soup.input  # renamed so the built-in input() is not shadowed
print(input_tag.parent)
print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
for x in input_tag.parents:
    print(x)
    print('--------------------------------------------\n')
# 6. Sibling tags
# tag object.next_siblings
# tag object.previous_siblings
print('==========================Sibling tags=======================')
print(list(soup.button.next_siblings))
print(list(soup.button.previous_siblings))
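# The singular forms return just the adjacent node; note it may be a
# whitespace string rather than a tag:
print(soup.button.next_sibling)
print(soup.button.previous_sibling)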
# 7. Standard selectors - select tags by name, attribute value or text content
# by tag name: parser/tag object.find_all(tag name)
# by attribute value: parser/tag object.find_all(attrs={attribute name: value})
# by text content: parser/tag object.find_all(text=content) (rarely useful - it only matches exact, whole strings)
print('==========================Standard selectors=======================')
# Get all p tags
all_p = soup.find_all('p')
print(len(all_p))
print(all_p)
print()
a = soup.find_all(attrs={
    'href': 'http://example.com/lacie'})
print(a)
p = soup.find_all(text='hello')
print(p)  # [] - the button's text is 'hello world!', not exactly 'hello'
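# find() is the single-result companion to find_all(): it returns the first
# match, or None when nothing matches, instead of a list:
print(soup.find('a', attrs={'href': 'http://example.com/lacie'}))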
# 8. CSS selectors
# parser/tag object.select(css selector) - returns a list of all matching tags
print(soup.select('.title'))
print(soup.select('p>b'))
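# select_one() returns only the first match; combined with get_text() and
# dict-style attribute access, this covers most extraction tasks:
first_link = soup.select_one('a.sister[href]')
if first_link is not None:
    print(first_link['href'], first_link.get_text())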