# 基础的网页链接抓取示例
# 1. 通过 BeautifulSoup 解析网页,获取页面中的所有链接
# 2. 判断每个链接是否为完整地址,不完整时在前面补上域名
# 3. 将标题和链接写入 txt 文件
import random
import re
from urllib.parse import urljoin

import requests  # HTTP client used to fetch the page
from bs4 import BeautifulSoup
# Site that is scraped; also the base used to complete relative links.
BASE_URL = 'https://www.jianshu.com'

# File that receives the "title\nurl\n" records.
OUTPUT_PATH = "url.txt"

# Pool of browser User-Agent strings; one is picked at random per request.
USER_AGENTS = (
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
)

# Characters removed from anchor text: spaces, carriage returns, line feeds.
_TITLE_NOISE = str.maketrans('', '', ' \r\n')


def getHeaders():
    """Return request headers carrying a randomly chosen User-Agent.

    Returns:
        dict: ``{'User-Agent': <one entry of USER_AGENTS>}``.
    """
    return {'User-Agent': random.choice(USER_AGENTS)}


def clean_title(raw_text):
    """Return *raw_text* with every space, CR and LF character removed."""
    return raw_text.translate(_TITLE_NOISE)


def resolve_href(href, base_url=BASE_URL):
    """Return *href* as an absolute URL, completing it against *base_url*.

    ``urljoin`` leaves already-absolute URLs untouched (with or without a
    ``www`` host) and completes site-relative and protocol-relative links,
    so no hand-written "is this link complete?" regex is needed.
    """
    return urljoin(base_url, href)


def format_links(links):
    """Render ``(title, url)`` pairs as text, one ``title\\nurl\\n`` per pair.

    Args:
        links: iterable of ``(title, url)`` string pairs.

    Returns:
        str: the concatenated records; empty string when *links* is empty.
    """
    return "".join(f"{title}\n{url}\n" for title, url in links)


def extract_links(soup, base_url=BASE_URL):
    """Collect ``(title, absolute_url)`` pairs from the anchors in *soup*.

    Args:
        soup: parsed BeautifulSoup document.
        base_url: URL used to complete relative links.

    Returns:
        list: pairs in document order; anchors with no ``href`` attribute
        or with blank text are skipped.
    """
    pairs = []
    # href=True keeps only anchors that carry an href attribute, so the
    # lookup below cannot raise KeyError on bare <a> tags.
    for tag in soup.find_all('a', href=True):
        title = clean_title(tag.get_text())
        if not title.strip():
            continue
        pairs.append((title, resolve_href(tag['href'], base_url)))
    return pairs


def main():
    """Fetch the home page, extract its links and write them to OUTPUT_PATH."""
    response = requests.get(BASE_URL, headers=getHeaders(), timeout=5)
    print(response)  # shows the HTTP status of the request
    print(response.content)  # raw page source, as bytes
    soup = BeautifulSoup(response.content, 'lxml')
    text = format_links(extract_links(soup))
    # Explicit UTF-8: titles are mostly Chinese and the platform default
    # encoding may not be able to represent them.
    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
        f.write(text)


# The guard sits below every definition so getHeaders() exists when it runs.
if __name__ == '__main__':
    main()
# 正则表达式学习笔记
# (http|https)+:\/\/w{3}((.\w+){2})
# (http|https)+: http 或 https 出现 1 次或多次,后面紧跟一个冒号
# \/\/ 转义后的斜杠,相当于匹配 //
# w{3} 字母 w 恰好连续出现 3 次(即 www)
# (.\w+){2} 任意一个字符后跟一个或多个单词字符,这一组合恰好重复 2 次,例如 .jianshu.com;注意未转义的 . 可以匹配任意字符,若只想匹配点号应写成 \.
# 正则测试网址:https://c.runoob.com/front-end/854