python简单爬取数据

python简单爬取数据

1 使用selenium和BeautifulSoup爬取数据

1.1 说明

  1. 获取单个页面;
  2. 使用“必应”引擎检索的数据;
  3. 解析html中的标签;

1.2 源代码


import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service


def get_text_page(page_url="https://www.oracle.com/artificial-intelligence/what-is-ai/",
                  wait_seconds=3):
    """Load a single page with Selenium and print its plain text.

    Parameters:
        page_url: URL to fetch (default kept from the original script).
        wait_seconds: seconds to sleep so dynamic content can finish rendering.
    """
    driver = webdriver.Chrome(service=Service("C:/Program Files/Google/Chrome/Application/chromedriver.exe"))
    try:
        driver.get(page_url)

        # Give the browser time to render the page before reading its source.
        time.sleep(wait_seconds)

        # Take the rendered HTML as UTF-8 bytes for BeautifulSoup.
        content = driver.page_source.encode('utf-8')

        # Parse the HTML with BeautifulSoup.
        dom_bs = BeautifulSoup(content, 'lxml')

        # Print the plain text extracted from the page.
        print(dom_bs.text)
    finally:
        # quit() shuts down both the browser and the chromedriver process;
        # the original close() could leave the driver process running,
        # and without finally it leaked on any exception above.
        driver.quit()


get_text_page()  # Demo: runs immediately when this script is executed.


# Open Bing, search for a keyword, and print each result's link and title.
def driver_open(key_word):
    """Search Bing for *key_word* and print each result's URL and title.

    Returns an empty string (kept for backward compatibility with callers).
    """
    # Build the search-results URL for the keyword.
    url = "https://cn.bing.com/search?q="+key_word+"&ensearch=1&FORM=BESBTB"
    driver = webdriver.Chrome(service=Service("C:/Program Files/Google/Chrome/Application/chromedriver.exe"))
    try:
        driver.get(url)

        # Wait for the browser to finish loading the results.
        time.sleep(2)

        # Take the rendered HTML as UTF-8 bytes.
        content = driver.page_source.encode('utf-8')
    finally:
        # Shut the browser down even if navigation fails; quit() also
        # terminates the chromedriver process, unlike close().
        driver.quit()

    # Parse the whole result page once.
    dom_bs = BeautifulSoup(content, 'lxml')

    # Each organic search result is an <li class="b_algo"> element.
    li_list_rs = dom_bs.find_all(name="li", attrs={"class":"b_algo"})

    for li_tag in li_list_rs:
        # Navigate the already-parsed tree directly instead of re-parsing
        # every fragment through a fresh BeautifulSoup instance.
        h2_tag = li_tag.find("h2")
        if h2_tag is None:
            continue

        a_tag = h2_tag.find("a")
        # Skip malformed results that lack a link (the original indexed
        # [0] unconditionally and would raise IndexError here).
        if a_tag is None or "href" not in a_tag.attrs:
            continue

        # Print the link address and the title text of the result.
        print(a_tag.attrs["href"])
        print(a_tag.text)

    return ""


driver_open("text")  # Demo call: searches Bing for the keyword "text".

2 使用trafilatura爬取网络数据

2.1 说明

  1. 比较两个文本的相似度;
  2. 获取网页中的url地址;
  3. 获取html中的数据;

2.2 源代码

# trafilatura的参考地址
# https://trafilatura.readthedocs.io/en/latest/index.html


# 1 Compare the similarity of two texts via Simhash fingerprints.
from trafilatura.hashing import Simhash

doc_a = Simhash("This is a text.")
doc_b = Simhash("This is a test.")
# similarity() is symmetric; print how close the two fingerprints are.
print(doc_b.similarity(doc_a))


# 2 Crawl data
from trafilatura.spider import focused_crawler

homepage = 'https://blog.csdn.net/make_progress?type=blog'
# Discover URLs reachable from the homepage; the limits keep the crawl short.
to_visit, known_urls = focused_crawler(
    homepage,
    max_seen_urls=2,
    max_known_urls=10,
)
# Show the frontier first, then every URL seen so far.
for url_collection in (to_visit, known_urls):
    print(url_collection)


# 3 Fetch the data of a single page
import trafilatura

response = trafilatura.fetch_url('https://blog.csdn.net/make_progress?type=blog', decode=False)
# fetch_url returns None when the download fails; the original dereferenced
# the result unconditionally and would crash with AttributeError.
if response is None:
    print("download failed")
else:
    # HTTP status code and the final (possibly redirected) URL.
    print(response.status)
    print(response.url)
    # Raw body bytes, decoded leniently to text.
    print(response.data.decode('utf-8', 'ignore'))

你可能感兴趣的:(python,开发语言)