1. Key techniques:
2. Process:
Brute-force enumeration was used (the article URLs end in plain numbers with no obvious pattern): 600 threads, roughly 900,000 requests in total, a 0.05-second delay after each request, and about 47 hours of wall-clock time for the whole run.
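Roughly speaking, the URL space is 600 "week" prefixes times up to 1,500 article numbers per prefix, about 900,000 candidates, and 900,000 × 0.05 s already accounts for about 12.5 hours of pure delay. A minimal single-threaded sketch of the enumeration (the URL pattern is the one used in the full crawler below):
# Sketch only: enumerate the candidate article URLs covered by the brute-force run.
# The pattern weburl + week + '/' + number + '.htm' is taken from the full crawler below.
weburl = 'https://www.ithome.com/0/'

def candidate_urls():
    for week in range(0, 600):           # one "week" prefix per thread in the real script
        for number in range(1, 1500):    # article numbers tried within each prefix
            yield weburl + str(week) + '/' + str(number) + '.htm'

# Print the first three candidates as a sanity check
for i, url in enumerate(candidate_urls()):
    if i >= 3:
        break
    print(url)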
import urllib.request
# Fetch the page
response = urllib.request.urlopen(it_url)
html = response.read().decode('utf-8')  # the page encoding may be gb2312, utf-8 or GBK
html_string = str(html)  # convert to a string so it can be inserted into the database directly
Get the text inside a tag directly, without using a regex: .string / .text (a short runnable example of these selectors follows this list)
Get a single p tag: .find("p")
Get a list of p tags: .find_all("p") / .find_all("div", attrs={"class": "red"}) / .find_all(id="ok")
Get an a tag's href attribute: a["href"]
Get an img tag's src attribute: img["src"]
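A short, self-contained example of these selectors (the HTML snippet is made up purely for illustration):
from bs4 import BeautifulSoup

# Hypothetical HTML snippet, just to demonstrate the selectors listed above
html = '<div class="red"><p>hello</p><a href="/x.htm"><img src="/y.png"></a></div>'
soup = BeautifulSoup(html, "html.parser")

print(soup.find("p").string)                               # hello
print(len(soup.find_all("div", attrs={"class": "red"})))   # 1
print(soup.find("a")["href"])                              # /x.htm
print(soup.find("img")["src"])                             # /y.png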
import requests
# GET
get_response = requests.get(url=' ')
res = get_response.text
This returns the comment count.
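In this crawler the GET goes to ithome's comment-count API, and the count is pulled out of the response with a regex, exactly as in the full script below (the week/number values here are only hypothetical placeholders):
import re
import requests

week, number = 396, 123  # example values; newsid is the week and article number concatenated
get_response = requests.get(url='https://dyn.ithome.com/api/comment/count?newsid=' + str(week) + str(number))
it_comment = re.findall(r'(\d+)', get_response.text)[-1]  # last number in the response is the comment count
print(it_comment)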
A POST request to the backend API is all that is needed to get the data into the database:
import requests
# POST
post_url = 'http://192.168.13.103/laravel55/public/index.php/it_pages2'
data = {
    "it_title": it_title,
    "it_url": it_url,
    "it_week": it_week,
    "it_number": it_number,
    "it_time": it_time,
    "it_source": it_source,
    "it_editor": it_editor,
    "it_comment": it_comment,
}
post = requests.post(url=post_url, data=data, headers={'Content-Type': 'application/x-www-form-urlencoded'})
This saves the data to the database.
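A minimal follow-up sketch for checking the result of the POST above, assuming the backend answers with the JSON shown in the Laravel handler further down ("state" is 1 on success):
result = post.json()  # the Laravel endpoint echoes a JSON object
if result.get("state") == 1:
    print("saved: " + result["content"]["it_title"])
else:
    print("insert failed: " + result.get("msg", ""))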
My backend is the PHP framework Laravel 5.5, so writing a single POST endpoint is all it takes. I also looked into Django, but found it a bit of a hassle: configuring the database is a hassle, CRUD is a hassle, creating a project is a hassle, going distributed is a hassle; in short, a bit of a hassle.
3. The full Python 3.x crawler code:
from bs4 import BeautifulSoup
import urllib.request
import requests
import os
import re
import time
import _thread
# Single article
def a_page(week, number, weburl):
    it_url = weburl + str(week) + '/' + str(number) + '.htm'  # article URL
    print('\nArticle URL: ' + it_url)
    try:
        # Fetch the page
        response = urllib.request.urlopen(it_url)
        html = response.read().decode('utf-8')  # the page encoding may be gb2312, utf-8 or GBK
        html_string = str(html)  # convert to a string so it can be stored directly
        soup = BeautifulSoup(html_string, "html.parser")  # parse the HTML
        title_div = soup.find_all('div', attrs={'class': 'post_title'})[0]
        it_title = title_div.find('h1').string  # title
        it_time = title_div.find_all(id='pubtime_baidu')[0].string  # publish time
        it_source = title_div.find_all(id='source_baidu')[0].find('a').string  # source
        it_editor = title_div.find_all(id='editor_baidu')[0].find('strong').string  # editor
        get_response = requests.get(url='https://dyn.ithome.com/api/comment/count?newsid=' + str(week) + str(number))
        it_comment = re.findall(r'(\d+)', get_response.text)[-1]  # comment count
        it_week = week  # week index
        it_number = number  # article number within the week
        print(it_title)
        print(it_url)
        print(it_week)
        print(it_number)
        print(it_time)
        print(it_source)
        print(it_editor)
        print(it_comment)
        print('Article fetched, number=' + str(number))
        # Store the article data
        post_url = 'http://192.168.13.103/laravel55/public/index.php/it_pages2'
        data = {
            "it_title": it_title,
            "it_url": it_url,
            "it_week": it_week,
            "it_number": it_number,
            "it_time": it_time,
            "it_source": it_source,
            "it_editor": it_editor,
            "it_comment": it_comment,
        }
        post = requests.post(url=post_url, data=data, headers={'Content-Type': 'application/x-www-form-urlencoded'})
        print('Data saved, res=' + post.text)
    except Exception:
        print('No such article, number=' + str(number) + '; week=' + str(week))
    time.sleep(0.05)  # per-request delay; these delays alone add up to about 12.5 hours, to avoid overloading the Wi-Fi router and reduce failed requests
def week_page(week, weburl):
    for number in range(1, 1500):  # maximum article number within one week
        print('\nArticle ' + str(number))
        a_page(week, number, weburl)
all_thread_num = 0

# Unused helper kept from the original script: counts how many threads have started
def page_class(cla, that_num):
    print("Thread started=" + str(that_num))
    global all_thread_num
    all_thread_num += 1
    print("Total threads=" + str(all_thread_num))
    for page in range(1, 30):
        print("inner=" + str(page))
for cla in range(0, 600):  # spawn one thread per "week" prefix
    try:
        _thread.start_new_thread(week_page, (cla, 'https://www.ithome.com/0/'))
    except Exception:
        print("Failed to start thread")

while 1:  # keep the main thread alive so the worker threads can keep running
    pass
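The `while 1: pass` busy-loop exists only to keep the main thread alive, since `_thread` has no join. A sketch of the same fan-out with the higher-level `threading` module instead (not the original code, just an alternative under the same assumptions), which lets the main thread wait for the workers and exit cleanly:
import threading

threads = []
for cla in range(0, 600):  # same one-thread-per-"week" fan-out as above
    t = threading.Thread(target=week_page, args=(cla, 'https://www.ithome.com/0/'))
    t.start()
    threads.append(t)

for t in threads:  # wait for all workers instead of busy-looping
    t.join()
print('All threads finished')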
Laravel API handler code:
public function pages(Request $request){
    // return DB::table('pages')->get();
    // Read the fields from the POST body
    $it_title = $request->input('it_title');
    $it_url = $request->input('it_url');
    $it_week = $request->input('it_week');
    $it_number = $request->input('it_number');
    $it_time = $request->input('it_time');
    $it_source = $request->input('it_source');
    $it_editor = $request->input('it_editor');
    $it_comment = $request->input('it_comment');
    $data = [
        "it_title" => $it_title,
        "it_url" => $it_url,
        "it_week" => $it_week,
        "it_number" => $it_number,
        "it_time" => $it_time,
        "it_source" => $it_source,
        "it_editor" => $it_editor,
        "it_comment" => $it_comment,
        "create_time" => date('YmdHis'),
    ];
    // Insert into the pages table and echo a JSON result
    $res = DB::table('pages')->insert($data);
    if ($res){
        $back = [
            "state" => 1,
            "msg" => "success",
            "content" => $data,
        ];
    }else{
        $back = [
            "state" => 0,
            "msg" => "failure",
            "content" => $data,
        ];
    }
    echo json_encode($back, JSON_UNESCAPED_UNICODE);
}
MySQL dump download:
Link 1: https://download.csdn.net/download/weixin_41827162/10795556
Link 2: https://makeoss.oss-cn-hangzhou.aliyuncs.com/it/it_pages2.sql