Contents
Method 1: Regular expressions (re)
Method 2: CSS selectors
Method 3: XPath
I use three parsing approaches, regular expressions, CSS selectors, and XPath, to extract the quotes, their authors, and their tags from Quotes to Scrape (toscrape.com).
First, here is a screenshot of the quotes page. The quotes are actually in English; my browser auto-translated the page, which is why the screenshot shows Chinese. The scraped text is the original English.
Method 1: Regular expressions (re)

from urllib.request import urlopen
import re

"""
.  any single character
*  zero or more of the preceding token (zero is fine)
?  zero or one of the preceding token
+  one or more of the preceding token
"""

url = 'https://quotes.toscrape.com/'
response = urlopen(url)
content = response.read().decode('utf-8')
# print(content)

# Parse the data with regular expressions

# Extract the quotes
last_quote = []
quotes = re.findall('<span class="text" itemprop="text">(.*)</span>', content)
# print(type(quotes), len(quotes))
for span in quotes:
    # strip the curly quotation marks wrapped around each quote
    # print(span.strip("“"))
    last_quote.append(span.strip("“”"))

# Extract the authors
last_author = []
authors = re.findall('<small class="author" itemprop="author">(.*)</small>', content)
for author in authors:
    # print(author)
    last_author.append(author)

# Extract the tags
last_tag = []
# re.RegexFlag.DOTALL makes . also match newlines,
# so one pattern can span the multi-line tags block
tags = re.findall('<div class="tags">(.*?)</div>', content, re.RegexFlag.DOTALL)
# print(len(tags))
# print(tags)
for tag in tags:
    # print(tag)
    true_tags = re.findall('<a class="tag" href=".*?">(.*)</a>', tag)
    result_tag = ",".join(true_tags)
    # print(result_tag)
    last_tag.append(result_tag)

for i in range(len(last_tag)):
    print(last_quote[i], last_author[i], last_tag[i])
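One pitfall worth noting: with re.RegexFlag.DOTALL in effect, a greedy (.*) in the tags pattern would swallow everything from the first tags block to the last closing </div> on the page, which is why the pattern above uses the non-greedy (.*?). Here is a minimal sketch of the difference, using a simplified two-block stand-in for the real markup:

import re

# Simplified stand-in for two consecutive tags blocks on the real page
sample = '<div class="tags"><a class="tag" href="#">love</a></div>\n' \
         '<div class="tags"><a class="tag" href="#">life</a></div>'

greedy = re.findall('<div class="tags">(.*)</div>', sample, re.DOTALL)
lazy = re.findall('<div class="tags">(.*?)</div>', sample, re.DOTALL)

print(len(greedy))  # 1 -- a single match spanning both blocks
print(len(lazy))    # 2 -- one match per tags block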
Method 2: CSS selectors

from urllib.request import urlopen
from bs4 import BeautifulSoup

# Step 1: send the request
url = 'https://quotes.toscrape.com/'
response = urlopen(url)
code = response.getcode()
print(code)

# Step 2: read the result
# html_content = response.read().decode('utf-8')
bs = BeautifulSoup(response, 'html.parser')

# Step 3: parse the data
# select() takes one argument: a CSS selector

# Extract the quotes
last_quote = []
spans = bs.select("span.text")
# print(len(spans), type(spans[0]))
for span in spans:
    print(span.text.strip("“”"))
    last_quote.append(span.text.strip("“”"))

# Extract the authors
last_author = []
authors = bs.select("small")
for author in authors:
    print(author.text)
    last_author.append(author.text)

# Extract the tags
last_tag = []
tags = bs.select("div.tags")
for tag in tags:
    anchors = tag.select("a")
    last_tag.append(",".join([a.text for a in anchors]))

# Step 4: print the combined results
for i in range(len(last_tag)):
    print(last_quote[i], last_author[i], last_tag[i])
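The three parallel lists only line up if every quote yields exactly one text, one author, and one tags block; a single missed element would shift everything out of alignment. Here is a sketch of an alternative that keeps each quote's fields together by walking the div.quote containers (assuming the page keeps its current class names):

from urllib.request import urlopen
from bs4 import BeautifulSoup

bs = BeautifulSoup(urlopen('https://quotes.toscrape.com/'), 'html.parser')

# Walk each quote container so the text, author, and tags stay grouped
for quote in bs.select("div.quote"):
    text = quote.select_one("span.text").text.strip("“”")
    author = quote.select_one("small.author").text
    tags = ",".join(a.text for a in quote.select("a.tag"))
    print(text, author, tags)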
Method 3: XPath

from urllib.request import urlopen
from lxml import etree

# Step 1: send the request
url = 'https://quotes.toscrape.com/'
response = urlopen(url)
code = response.getcode()
print(code)

# Step 2: read the result
html_content = response.read().decode('utf-8')

# Step 3: parse the data
# etree.HTML() builds an element tree that can be queried with XPath
tree = etree.HTML(html_content)

# Extract the quotes
last_quote = []
spans = tree.xpath('/html/body/div/div[2]/div[1]/div/span[1]')
# print(len(spans), type(spans[0]))
for span in spans:
    # print(span.text.strip('“”'))
    last_quote.append(span.text.strip('“”'))

# Extract the authors
last_author = []
authors = tree.xpath('/html/body/div/div[2]/div[1]/div/span/small')
# print(len(authors), type(authors[0]))
for author in authors:
    # print(author.text)
    last_author.append(author.text)

# Extract the tags
last_tag = []
tags = tree.xpath('/html/body/div/div[2]/div[1]/div/div')
# print(len(tags), type(tags[0]))
for tag in tags:
    # './a' is evaluated relative to the current tags div
    anchors = tag.xpath('./a')
    last_tag.append(','.join([a.text for a in anchors]))

# Step 4: print the combined results
for i in range(len(last_tag)):
    print(last_quote[i], last_author[i], last_tag[i])
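Absolute paths like /html/body/div/div[2]/... break as soon as the page layout shifts. Here is a sketch of a more resilient version that anchors on class attributes instead, again assuming the site keeps its current class names:

from urllib.request import urlopen
from lxml import etree

tree = etree.HTML(urlopen('https://quotes.toscrape.com/').read().decode('utf-8'))

# Relative, class-based XPath survives layout changes that absolute paths do not
for quote in tree.xpath('//div[@class="quote"]'):
    text = quote.xpath('./span[@class="text"]/text()')[0].strip('“”')
    author = quote.xpath('.//small[@class="author"]/text()')[0]
    tags = ','.join(quote.xpath('./div[@class="tags"]/a[@class="tag"]/text()'))
    print(text, author, tags)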
All three parsing approaches produce exactly the same output, shown in the screenshot below: