from urllib import request
from bs4 import BeautifulSoup

url = r'https://www.jianshu.com/'
# Send a browser User-Agent so the site does not reject the request.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
page = request.Request(url, headers=headers)
page_info = request.urlopen(page).read().decode('utf-8')
soup = BeautifulSoup(page_info, 'html.parser')
# Article titles live in <a> tags with class "title".
titles = soup.find_all('a', 'title')

# Open/create E:\titles.txt for writing.  A `with` block closes the file
# even on error and avoids the original bug: `finally: if file:` raised
# NameError when open() itself failed, since `file` was never bound.
# utf-8 is specified explicitly because the titles are Chinese text.
with open(r'E:\titles.txt', 'w', encoding='utf-8') as out:
    for tmp in titles:
        # Write each scraped article title to the file.
        # tag.string is None when the tag has nested markup — skip those
        # instead of crashing on `None + '\n'`.
        if tmp.string:
            out.write(tmp.string + '\n')
python中BeautifulSoup的find用法:find以及find_all
将以上代码保存为title.py。进入该文件存放目录,在cmd下执行python title.py。执行结束后E盘的titles.txt文件内容如下
同样,保存以下代码为img.py并执行,但是执行结果出错,错误如下:
# -*- coding:utf-8 -*-
import os
import time
from urllib import request
from bs4 import BeautifulSoup
import re

url = r'https://www.zhihu.com/question/28594126'
# Browser User-Agent so zhihu serves the normal HTML page.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
page = request.Request(url, headers=headers)
page_info = request.urlopen(page).read().decode('utf-8')
soup = BeautifulSoup(page_info, 'html.parser')
# Combine Beautiful Soup with a regex: select <img> tags with the given
# class whose src attribute ends in .jpg.
links = soup.find_all('img', "origin_image zh-lightbox-thumb", src=re.compile(r'.jpg$'))
# Destination directory; without this the images land in the script's
# current directory.  Create it up front — urlretrieve raises
# FileNotFoundError when the target directory does not exist, which is
# the error the original run hit.
local_path = r'E:\imgage'
os.makedirs(local_path, exist_ok=True)
for link in links:
    print(link.attrs['src'])
    # Name each file with time.time() to avoid filename collisions.
    request.urlretrieve(link.attrs['src'], local_path + r'\%s.jpg' % time.time())
后来发现是因为 time.time() 作为文件名的方式出错,并且 for 循环的缩进一定要注意保持一致。改为如下之后,成功保存了知乎话题「女生健身如何保持好身材?」下的图片:
# Number the images sequentially; the running index keeps each
# saved file name unique (replaces the manual x counter).
for index, link in enumerate(links):
    src = link.attrs['src']
    print(src)
    # Download the image and name it after its sequence number.
    request.urlretrieve(src, 'E:\\image\\' + '%s.jpg' % index)
运行结果如下: