Data Parsing

1. Regex-based data parsing

import requests
from re import findall
import csv
from threading import Thread
from queue import Queue


# Zhihu
def get_data():
    # fetch the page
    headers = {
        'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36',
        'Cookie': 'q_c1=e9c0fc9936bb43a9843e7c42cb3e6606|1598837492000|1598837492000; _zap=9c31d477-d8f7-4b4c-9569-10ce524b747f; _xsrf=1AFWm2YY8bPlhvAUno70CULPv74aWf2n; d_c0="AECXX3ic0BGPThwqzgDL2X5bYsQqEf8R-L4=|1598837437"; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1598837438; _ga=GA1.2.1020087299.1598837439; _gid=GA1.2.1083655796.1598837439; capsion_ticket="2|1:0|10:1598837455|14:capsion_ticket|44:YjBmOGViNDhhNjc0NGQwNGFiNjIwNzMxYWRmMzNkNjc=|954713d02354a83bb5b9b68aba03960433094b253c9d81562d402a23690405c4"; z_c0="2|1:0|10:1598837490|4:z_c0|92:Mi4xaW5CWUdRQUFBQUFBUUpkZmVKelFFU1lBQUFCZ0FsVk44cUE1WUFBeGVaMFh5YVNwajRyN3FFaDF3Xy1rQ0kzanRn|f02fb78cc4fde9903ca8074b7752f0b59f2196b8a458f19f6f17c4f1e2badf4e"; unlock_ticket="ADAc3rNA2xAmAAAAYAJVTfpZTF8eeZGyZPAsyJWjNIukQ5J4yqoyVw=="; tst=r; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1598837520; SESSIONID=e9uJa4vdbp1NourUuiQYWFSbTkyDm9oSA96SosNYcG5; JOID=UlsRBU20Xboa6HYaUbL452WQ7M9E0mPNUI9BazfQLoxWhwVrA6IL9EPtch9TXUeToE63P2LXlYrWjgBYgnUkIC8=; osd=V1sXAU-xXbwe6nMaV7b64mWW6M1B0mXJUopBbTPSK4xQgwduA6QP9kbtdBtRWEeVpEyyP2TTl4_WiARah3UiJC0=; KLBRSID=53650870f91603bc3193342a80cf198c|1598837706|1598837436'
    }
    response = requests.get('https://www.zhihu.com/', headers=headers)

    # parse the data
    print(response.text)
    # Grab all titles. The feed is embedded in the page as JSON, so a JSON-style
    # pattern mirroring the comment_count one below is used; the exact key name
    # is an assumption about the page source, not verified.
    re_str = r'"title": ?"(.+?)"'
    print(findall(re_str, response.text))
    # Grab all upvote counts; the rendered text reads like "赞同 1234", so a
    # simplified digit pattern is used (an assumption about the markup):
    re_str = r'赞同 (\d+)'
    print(findall(re_str, response.text))

    # Grab all comment counts from the embedded JSON
    re_str = r'"comment_count":(.+?),'
    print(findall(re_str, response.text))


# Toutiao "street snap" (街拍) search; the endpoint returns JSON
def get_jiepai():
    header = {
        'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
    }
    url = 'https://www.toutiao.com/api/search/content/?aid=24&app_name=web_search&offset=0&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&en_qc=1&cur_tab=1&from=search_tab&pd=synthesis&timestamp=1598840365034&_signature=EeE9wAAgEBAhM65JM0VVJhHgfNAAE6yx9JqQafmWS3C-vrdduSBTwaXD7nAun8UsS25xGLJiQHARyYdKsUB73PH0NcbAS308Kedzm9KELmE9UFYfaDuEHt3aov7a-CdwyIJ'
    response = requests.get(url, headers=header)
    response.encoding = 'utf-8'
    print(response.text)
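
# The endpoint above returns JSON, so the body could be parsed directly instead
# of printed as raw text. A minimal sketch; the 'data' and 'title' keys are an
# assumption about the response shape, not verified against the live API:
def parse_jiepai(response):
    result = response.json()
    for item in result.get('data', []):
        if 'title' in item:   # not every entry carries a title
            print(item['title'])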


# Fetch Douban Top 250 data with multiple threads
def get_url_data(url, q: Queue):
    headers = {
        'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    # Two capture groups: movie title and rating. The tags in this pattern are
    # reconstructed from Douban's list markup; '[^&]' skips the alternate-title
    # spans, whose text starts with '&nbsp;'.
    re_str = r'(?s)<span class="title">([^&].*?)</span>.+?<span class="rating_num".*?>(.+?)</span>'
    q.put(findall(re_str, response.text))


def douban_thread():
    movies_q = Queue()
    start = -25
    all_thread = []
    for _ in range(10):
        start += 25
        url = 'https://movie.douban.com/top250?start=' + str(start)
        t = Thread(target=get_url_data, args=(url, movies_q))
        t.start()
        all_thread.append(t)

    for t in all_thread:
        t.join()
    # collect the results and write them out
    all_movies = []
    for _ in range(len(all_thread)):
        all_movies += movies_q.get()
    # sort by rating, highest first
    all_movies.sort(reverse=True, key=lambda item: float(item[1]))
    # prepend the rank; enumerate avoids the O(n^2), duplicate-unsafe list.index lookup
    new_all_movies = [(rank + 1, *movie) for rank, movie in enumerate(all_movies)]
    with open('files/豆瓣电影Top250.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['排名', '电影名称', '评分'])  # rank / title / rating
        writer.writerows(new_all_movies)
    # print(all_movies)


# Fetch Douban Top 250 data with a single thread
def douban():
    all_movies = []
    start = -25
    for _ in range(10):
        start += 25
        url = 'https://movie.douban.com/top250?start='+str(start)
        headers = {
            'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
        }
        # fetch the page
        response = requests.get(url, headers=headers)
        # print(response.text)
        # parse the data (same reconstructed title/rating pattern as above)
        re_str = r'(?s)<span class="title">([^&].*?)</span>.+?<span class="rating_num".*?>(.+?)</span>'
        all_movies += findall(re_str, response.text)

    # add the rank after the loop; the pages arrive in rank order, so the list
    # position is the rank (enumerate replaces the duplicate-unsafe list.index)
    new_all_movies = [(rank + 1, *movie) for rank, movie in enumerate(all_movies)]

    # save the data
    with open('files/豆瓣电影.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['排名', '电影名称', '评分'])  # rank / title / rating
        writer.writerows(new_all_movies)


if __name__ == '__main__':
    douban_thread()
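
The two-group pattern used above makes findall return a list of (title, rating)
tuples, which is exactly the row shape csv.writerows expects. A minimal sketch of
the same technique on a made-up HTML fragment (the sample string and values are
for illustration only):

from re import findall

sample = '<span class="title">肖申克的救赎</span> ... <span class="rating_num" property="v:average">9.7</span>'
re_str = r'(?s)<span class="title">([^&].*?)</span>.+?<span class="rating_num".*?>(.+?)</span>'
print(findall(re_str, sample))   # [('肖申克的救赎', '9.7')]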

2. Using bs4

from bs4 import BeautifulSoup

html = """
The Dormouse's story

我是段落1我是font1我是font2

hello world!

The Dormouse's story1 我是span中的font

The Dormouse's story2

Once upon a time there were three little sisters; and their names were , Lacie and Tillie; and they lived at the bottom of a well.

...

"""
# 1. Create a parser object for the page
# BeautifulSoup(document to parse, parser type - 'lxml')
soup = BeautifulSoup(html, 'lxml')
print(type(soup))   # <class 'bs4.BeautifulSoup'>
# bs4 automatically completes missing page tags
print(soup)
# get the completed document as a string
print(type(soup.prettify()))   # <class 'str'>
# print(soup.prettify())

# 2. Tag selectors
# parser object.tag name - the first tag with that name in the parsed document
print('======================tag selectors================')
# the first title tag in the page behind soup
print(soup.title)
# the first p tag in the page
print(soup.p)
# the first font tag inside the first p tag
print(soup.p.font)

# 3. Getting tag names, tag attributes and tag contents
print('====================tag names, attributes and contents====================')
# 1) tag name: tag.name
print(soup.title.name)
# 2) tag attributes:
# tag.attrs - a dict of all of the tag's attributes and their values
# tag.attrs[attribute name]
print(soup.a.attrs)
print(soup.a.attrs['href'], soup.a.attrs['class'])
# 3) tag contents
# tag.string - the tag's text (if the content is a single child tag, returns that
#              child's text; if text and child tags are mixed, returns None)
# tag.get_text() - all text in the tag, including text inside child tags
# tag.contents - the tag's contents as a list (elements are text and child tags)
print(soup.span.string)      # 我是span
print(soup.p.string)         # None
print(soup.span.get_text())  # 我是span
print(soup.p.get_text())     # 我是段落1我是font1我是font2
print(soup.span.contents)    # ['我是span']
print(soup.p.contents)       # ['我是段落1', <font>我是font1</font>, <font>我是font2</font>]

# 4. Children and descendants
# tag.children
print('==========================children and descendants=================')
# child tags only
# print(list(soup.div.children))
print([item for item in soup.div.children if not isinstance(item, str)])
# descendant tags
# print(list(soup.div.descendants))
print([item for item in soup.div.descendants if not isinstance(item, str)])
# note: isinstance(value, type) - returns True if the value is of the given type

# 5. Parents and ancestors
# tag.parent
# tag.parents
print('==========================parents and ancestors=========================')
input_tag = soup.input   # named input_tag so the builtin input() is not shadowed
print(input_tag.parent)
print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
for x in input_tag.parents:
    print(x)
    print('--------------------------------------------\n')

# 6. Sibling tags
# tag.next_siblings
# tag.previous_siblings
print('==========================sibling tags=======================')
print(list(soup.button.next_siblings))
print(list(soup.button.previous_siblings))

# 7. Standard selectors - find tags by tag name, attribute value or tag content
# by tag name: soup/tag.find_all(tag name)
# by attribute value: soup/tag.find_all(attrs={attribute name: value})
# by content: soup/tag.find_all(text=content)   (rarely useful!)
print('==========================standard selectors=======================')
# get all p tags
all_p = soup.find_all('p')
print(len(all_p))
print(all_p)
print()
a = soup.find_all(attrs={'href': 'http://example.com/lacie'})
print(a)
p = soup.find_all(text='hello')
print(p)

# 8. CSS selectors
# soup/tag.select(css selector)
print(soup.select('.title'))
print(soup.select('p>b'))
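
With the parser built, selectors can feed structured extraction directly. As a
short sketch against the same html document above, pairing each sister link's
href with its link text:

for a in soup.select('p a.sister'):
    print(a.attrs['href'], a.get_text())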
