urlopen 用来打开并读取一个从网络获取的远程对象。
# Fetch a remote page and dump its raw bytes.
# Fix: the original import was "u rlopen" (a stray space), which is a SyntaxError.
from urllib.request import urlopen

html = urlopen("http://pythonscraping.com/pages/page1.html")
print(html.read())
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
def getTitle(url):
    """Fetch *url* and return its <body><h1> tag, or None on any failure.

    Returns a bs4 Tag on success; prints the error and returns None when
    the HTTP request fails or the page lacks a <body>/<h1>.
    """
    try:
        # Context manager closes the HTTP connection even if read() raises;
        # the original never closed the response and leaked the socket.
        with urlopen(url) as html:
            page = html.read()
    except HTTPError as e:
        print(e)
        return None
    try:
        bsObj = BeautifulSoup(page, "lxml")
        # AttributeError covers a missing <body> or <h1> (attribute is None).
        title = bsObj.body.h1
    except AttributeError as e:
        print(e)
        return None
    return title
# Exercise getTitle and report the outcome.
title = getTitle("http://pythonscraping.com/pages/page1.html")
# `is None` is the idiomatic identity test (PEP 8); `== None` goes through __eq__.
if title is None:
    print("Title could not be found!")
else:
    print(title)
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
# Print every <span class="green"> (character names) from the page.
# Fix: the original fell through after a caught error, so a failed urlopen
# left `html` undefined (NameError on the BeautifulSoup line), and a failed
# parse left `bsObj` undefined (NameError on findAll).
html = None
try:
    html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
except HTTPError as e:
    print(e)

if html is not None:
    try:
        bsObj = BeautifulSoup(html, "lxml")
    except AttributeError as e:
        print(e)
    else:
        namelist = bsObj.findAll("span", {"class": "green"})
        for name in namelist:
            print(name.get_text())
findAll(tag, attributes, recursive, text, limit, keywords)
find(tag, attributes, recursive, text, keywords)
tag:传一个标签名称或多个标签组成的列表
attributes:传一个Python字典封装一个标签的若干属性和属性值。例如:.findAll("span", {"class":{"green", "red"}})
recursive:是一个递归参数,要求传一个布尔变量,默认值是True,所以findAll默认会去查找标签参数的所有子标签,以及子标签的子标签。改为False,findAll就只查找文档的一级标签。
text:用标签的文本内容去匹配,而不是标签的属性。
limit:范围限制参数,显然只用于findAll,find其实等价于findAll中limit=1的情况。limit参数设置后,它返回的前limit项结果是按照网页上的顺序排序的。
keyword:可以让你选择指定属性的标签,是BeautifulSoup设置的一个冗余功能,可替代,且偶尔会出现问题。例如bsObj.findAll(class="green"),会产生一个语法错误,因为class是Python的保留字。
.get_text() 会把你正在处理的 HTML 文档中所有的标签都清除,然后返回一个只包含文字的字符串。假如你正在处理一个包含许多超链接、段落和标签的大段源代码,那么 .get_text() 会把这些超链接、段落和标签都清除掉,只剩下一串不带标签的文字。
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
# Print the direct children of the giftList table (rows, plus whitespace
# text nodes — .children does NOT recurse into descendants).
# Fix: the original fell through after a caught error, so a failed urlopen
# or parse left `html`/`bsObj` undefined and raised NameError on the next line.
html = None
try:
    html = urlopen("http://www.pythonscraping.com/pages/page3.html")
except HTTPError as e:
    print(e)

if html is not None:
    try:
        bsObj = BeautifulSoup(html, "lxml")
    except AttributeError as e:
        print(e)
    else:
        for child in bsObj.find("table", {"id": "giftList"}).children:
            print(child)
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
# Print every row AFTER the header row: next_siblings starts at the element
# following the first <tr>, so the header itself is skipped.
# Fix: the original fell through after a caught error, so a failed urlopen
# or parse left `html`/`bsObj` undefined and raised NameError on the next line.
html = None
try:
    html = urlopen("http://www.pythonscraping.com/pages/page3.html")
except HTTPError as e:
    print(e)

if html is not None:
    try:
        bsObj = BeautifulSoup(html, "lxml")
    except AttributeError as e:
        print(e)
    else:
        for sibling in bsObj.find("table", {"id": "giftList"}).tr.next_siblings:
            print(sibling)
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
# Navigate from an <img> to the price cell: the image's parent <td>'s
# previous sibling cell holds the price text.
# Fix: the original fell through after a caught error, so a failed urlopen
# or parse left `html`/`bsObj` undefined and raised NameError on the next line.
html = None
try:
    html = urlopen("http://www.pythonscraping.com/pages/page3.html")
except HTTPError as e:
    print(e)

if html is not None:
    try:
        bsObj = BeautifulSoup(html, "lxml")
    except AttributeError as e:
        print(e)
    else:
        print(bsObj.find("img", {"src": "../img/gifts/img1.jpg"})
              .parent.previous_sibling.get_text())
邮箱:[A-Za-z0-9\._+]+@[A-Za-z]+\.(com|org|edu|net)
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import re
# Find all product images whose src matches the gifts-image path pattern.
# Fixes: (1) the original fell through after a caught error, leaving
# `html`/`bsObj` undefined (NameError on the next line); (2) the pattern
# string used the invalid escape "\/" in a non-raw string (DeprecationWarning,
# SyntaxWarning on Python 3.12+) — raw string, same compiled pattern.
html = None
try:
    html = urlopen("http://www.pythonscraping.com/pages/page3.html")
except HTTPError as e:
    print(e)

if html is not None:
    try:
        bsObj = BeautifulSoup(html, "lxml")
    except AttributeError as e:
        print(e)
    else:
        images = bsObj.findAll("img", {"src": re.compile(r"\.\./img/gifts/img.*\.jpg")})
        for img in images:
            print(img["src"])
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import re
# Same scrape as above, but read the attribute via the .attrs dict
# (img.attrs["src"] is equivalent to img["src"]).
# Fixes: (1) the original fell through after a caught error, leaving
# `html`/`bsObj` undefined (NameError on the next line); (2) the pattern
# string used the invalid escape "\/" in a non-raw string (DeprecationWarning,
# SyntaxWarning on Python 3.12+) — raw string, same compiled pattern.
html = None
try:
    html = urlopen("http://www.pythonscraping.com/pages/page3.html")
except HTTPError as e:
    print(e)

if html is not None:
    try:
        bsObj = BeautifulSoup(html, "lxml")
    except AttributeError as e:
        print(e)
    else:
        images = bsObj.findAll("img", {"src": re.compile(r"\.\./img/gifts/img.*\.jpg")})
        for img in images:
            print(img.attrs["src"])