Simple data scraping with Python
1 Scraping data with Selenium and BeautifulSoup
1.1 Notes
- Fetches a single page;
- Retrieves search results from the Bing engine;
- Parses the tags in the HTML;
1.2 Source code
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

def get_text_page():
    # Fetch a single page and print its visible text
    page_url = "https://www.oracle.com/artificial-intelligence/what-is-ai/"
    driver = webdriver.Chrome(service=Service("C:/Program Files/Google/Chrome/Application/chromedriver.exe"))
    driver.get(page_url)
    time.sleep(3)  # give the page time to render
    content = driver.page_source.encode('utf-8')
    dom_bs = BeautifulSoup(content, 'lxml')
    print(dom_bs.text)  # visible text of the whole document
    driver.quit()  # quit() ends the session; close() only closes the window

get_text_page()
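On Selenium 4.6 or newer, the bundled Selenium Manager can locate a matching chromedriver by itself, so the hardcoded executable path above becomes optional. A minimal sketch (the function name and the headless option are illustrative assumptions, not part of the original code):

from selenium.webdriver.chrome.options import Options

def get_text_page_headless():  # hypothetical variant of get_text_page()
    options = Options()
    options.add_argument("--headless=new")  # run Chrome without a visible window
    # No Service path: Selenium Manager (Selenium >= 4.6) resolves the driver
    driver = webdriver.Chrome(options=options)
    driver.get("https://www.oracle.com/artificial-intelligence/what-is-ai/")
    time.sleep(3)
    text = BeautifulSoup(driver.page_source, 'lxml').text
    driver.quit()
    return text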
def driver_open(key_word):
    # Search Bing (English results) for the keyword and print each hit
    url = "https://cn.bing.com/search?q=" + key_word + "&ensearch=1&FORM=BESBTB"
    driver = webdriver.Chrome(service=Service("C:/Program Files/Google/Chrome/Application/chromedriver.exe"))
    driver.get(url)
    time.sleep(2)  # wait for the results to render
    content = driver.page_source.encode('utf-8')
    dom_bs = BeautifulSoup(content, 'lxml')
    driver.quit()
    # Each organic search result is an <li class="b_algo"> element
    li_list_rs = dom_bs.find_all(name="li", attrs={"class": "b_algo"})
    for li_tag in li_list_rs:
        # The title link is an <a> nested inside an <h2>; the Tag objects
        # can be queried directly, with no need to re-parse them
        h2_a_tag = li_tag.find(name="h2").find(name="a")
        print(h2_a_tag.attrs["href"])  # result URL
        print(h2_a_tag.text)           # result title
    return ""

driver_open("text")
2 Scraping web data with trafilatura
2.1 Notes
- Compares the similarity of two texts;
- Collects the URLs found on a web page;
- Fetches the raw HTML data of a page;
2.2 Source code
from trafilatura.hashing import Simhash

# Simhash builds a locality-sensitive fingerprint of each string;
# similarity() compares the two fingerprints
first = Simhash("This is a text.")
second = Simhash("This is a test.")
print(second.similarity(first))
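similarity() returns a score between 0 and 1, where higher means more alike; the one-character difference above therefore scores close to 1. A quick check using only the API shown above (the sample strings are arbitrary):

base = Simhash("Python is a programming language.")
for text in ("Python is a programming language!",
             "A completely different sentence about cooking."):
    # Unrelated text should score noticeably lower than the near-duplicate
    print(text, "->", base.similarity(Simhash(text)))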
from trafilatura.spider import focused_crawler

# Crawl within one site: visit at most 2 pages and record up to 10 URLs
homepage = 'https://blog.csdn.net/make_progress?type=blog'
to_visit, known_urls = focused_crawler(homepage, max_seen_urls=2, max_known_urls=10)
print(to_visit)    # frontier: URLs queued for the next visits
print(known_urls)  # all URLs discovered so far
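The trafilatura documentation also describes resuming an interrupted crawl by feeding the previous state back into focused_crawler; a sketch, assuming the todo and known_links parameters as shown in those docs:

# Continue crawling from the saved frontier and known-URL set
to_visit, known_urls = focused_crawler(homepage,
                                       max_seen_urls=5, max_known_urls=50,
                                       todo=to_visit, known_links=known_urls)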
import trafilatura

# decode=False returns the raw response object instead of decoded text
response = trafilatura.fetch_url('https://blog.csdn.net/make_progress?type=blog', decode=False)
print(response.status)  # HTTP status code
print(response.url)     # final URL after any redirects
print(response.data.decode('utf-8', 'ignore'))  # raw HTML bytes, decoded
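In the usual trafilatura workflow the downloaded page is then handed to the extractor, which strips navigation and boilerplate and returns the main text:

downloaded = trafilatura.fetch_url('https://blog.csdn.net/make_progress?type=blog')
if downloaded is not None:  # fetch_url returns None on failure
    # extract() returns the main text, or None if nothing could be extracted
    print(trafilatura.extract(downloaded))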