代码示例
#!/usr/bin/python
# -*- coding:utf-8 -*-
import json
import re

from bs4 import BeautifulSoup
# Example script: parse an HTML document with Beautiful Soup, print a few
# values from it, and dump a scraped article list to home.json.
#
# NOTE(review): `html` is not defined in this snippet; it is expected to hold
# the page markup (a string) before this point. Alternatively, parse a local
# file instead:
#     soup = BeautifulSoup(open('page.html'), 'lxml')
soup = BeautifulSoup(html, "lxml")

# Print the document title. `soup.title` is used rather than `soup.head`,
# because `.string` on <head> is None as soon as <head> has more than one
# child (meta tags, stylesheets, ...).
print(soup.title.string)

# Find <a> tags that:
#   - have the CSS class "sister" (`class` is a Python keyword, hence the
#     trailing underscore in `class_`),
#   - have an href matching the pattern "example",
#   - are direct children of <body> (recursive=False disables the deep search).
HREF_PATTERN = re.compile(r'example')
for tag in soup.body.find_all(name='a', class_='sister', href=HREF_PATTERN,
                              recursive=False):
    print(tag.attrs['href'])

# Sample for https://www.wxqfb.com/weixin/index/id/139.html: collect the link,
# image URL and title of every <li> under the `.newsllist` element.
# The title could equivalently be read with:
#     item.a.find('div', class_='title').contents[0]
data = [
    {
        'link': item.a['href'],
        'image': item.a.img['src'],
        'title': item.a.select('div.title')[0].contents[0],
    }
    for item in soup.select('.newsllist li')
]

# Serialize the collected entries as JSON; the context manager guarantees the
# file is closed even if serialization or the write fails.
with open('home.json', 'w+') as f:
    f.write(json.dumps(data))
简介
Beautiful Soup 是一个可以从 HTML 或 XML 文件中提取数据的 Python 库。它能够通过你喜欢的转换器实现惯用的文档导航、查找、修改文档的方式。Beautiful Soup 会帮你节省数小时甚至数天的工作时间。
文档
https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/