(1) Scraping with regular expressions
(1.1) Opening the page
from urllib.request import urlopen
# if the page contains Chinese, decode it with decode('utf-8')
html = urlopen(
    "https://morvanzhou.github.io/static/scraping/basic-structure.html"
).read().decode('utf-8')
print(html)
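Rather than hard-coding 'utf-8', the charset can usually be read from the HTTP response headers; a minimal sketch (the 'utf-8' fallback is an assumption for pages that do not declare one):
from urllib.request import urlopen

resp = urlopen("https://morvanzhou.github.io/static/scraping/basic-structure.html")
# get_content_charset() reads the charset from the Content-Type header
charset = resp.headers.get_content_charset() or 'utf-8'  # fall back to utf-8 if not declared (assumption)
html = resp.read().decode(charset)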
(1.2) Regex matching
import re
res = re.findall(r"<title>(.+?)</title>", html)
print("\nPage title is: ", res[0])
res = re.findall(r"<p>(.*?)</p>", html, flags=re.DOTALL)  # re.DOTALL lets '.' match newlines too (see the sketch below)
print("\nPage paragraph is: ", res[0])
res = re.findall(r'href="(.*?)"', html)
print("\nAll links: ", res)
(2) BeautifulSoup
from bs4 import BeautifulSoup
from urllib.request import urlopen
# if the page contains Chinese, decode it with decode('utf-8')
html = urlopen("https://morvanzhou.github.io/static/scraping/basic-structure.html").read().decode('utf-8')
print(html)
soup = BeautifulSoup(html, features='lxml')
print(soup.h1)
print('\n', soup.p)
all_href = soup.find_all('a')
all_href = [l['href'] for l in all_href]
print('\n', all_href)
To sum up: even without fully understanding the page's markup, a parsed tag can be treated like a dict structure.
(If a page contains several of the same tag, such as the links <a>, we can use
find_all()
to collect all of them. The actual link is not the text between <a> and </a>, but lives in
href
, which can be seen as an attribute of the <a> tag. We can read it like a Python dictionary, by key:
l["href"]
.)
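Indexing like l['href'] raises KeyError on an <a> tag that happens to lack the attribute. A minimal sketch of a safer variant using Tag.get(), which returns None instead:
all_href = [l.get('href') for l in soup.find_all('a')]
all_href = [h for h in all_href if h is not None]  # drop <a> tags without an href
print('\n', all_href)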
(3) CSS
HTML and CSS are close partners; together they make up most of today's web pages. If the world had no CSS, every page you see might look like plain, unstyled markup. Extremely "bare-bones"!
from bs4 import BeautifulSoup
from urllib.request import urlopen
# if the page contains Chinese, decode it with decode('utf-8')
html = urlopen("https://morvanzhou.github.io/static/scraping/list.html").read().decode('utf-8')
print(html)
...
<!-- this part is the CSS -->
<style>
.jan {
    background-color: yellow;
}
...
.month {
    color: red;
}
</style>
...
<li class="month">一月</li>
<ul class="jan">
    <li>一月一号</li>
    <li>一月二号</li>
    <li>一月三号</li>
</ul>
...
(3.1) Matching by class
soup = BeautifulSoup(html, features='lxml')
# use class to narrow search
month = soup.find_all('li', {"class": "month"})
for m in month:
    print(m.get_text())
"""
一月
二月
三月
四月
五月
"""
jan = soup.find('ul', {"class": 'jan'})
d_jan = jan.find_all('li') # use jan as a parent
for d in d_jan:
    print(d.get_text())
"""
一月一号
一月二号
一月三号
"""
(4) Regular expressions in BeautifulSoup
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
# if the page contains Chinese, decode it with decode('utf-8')
html = urlopen("https://morvanzhou.github.io/static/scraping/table.html").read().decode('utf-8')
soup = BeautifulSoup(html, features='lxml')
img_links = soup.find_all("img", {"src": re.compile(r'.*?\.jpg')})
for link in img_links:
    print(link['src'])
"""
https://morvanzhou.github.io/static/img/course_cover/tf.jpg
https://morvanzhou.github.io/static/img/course_cover/rl.jpg
https://morvanzhou.github.io/static/img/course_cover/scraping.jpg
"""
course_links = soup.find_all('a', {'href': re.compile(r'https://morvan.*')})
for link in course_links:
    print(link['href'])
"""
https://morvanzhou.github.io/
https://morvanzhou.github.io/tutorials/scraping
https://morvanzhou.github.io/tutorials/machine-learning/tensorflow/
https://morvanzhou.github.io/tutorials/machine-learning/reinforcement-learning/
https://morvanzhou.github.io/tutorials/data-manipulation/scraping/
"""