忙了一周,恰好在搞中秋活动,现在终于有时间水文章了。今天来点不一样的,不玩逆向,来分享一下三种语言的网络爬虫,哈哈,纯属个人分享。所谓技多不压身,感兴趣的小伙伴也可以去卷一卷。
作者感觉:
就这三种语言而言,目前比较成熟、适合玩爬虫的还是得 Python,要说性能,作者也觉得还是 Python 比较好;不过在同一网络下,作者实测速度最快的是 JavaScript,相对有点差异的是 Go。JavaScript 虽然兼容性很强,但并不适合用来搞爬虫。当然,这只是作者自己的看法,我也是今天闲得有点蛋疼,哈哈。
下面就以图片加代码的形式展示一下针对同一平台各个语言的爬虫流程:
按时间排个序
第一名:JavaScript ==>0.6s
JavaScript参考代码:
var request = require('request');
const xpath = require('xpath')
const dom = require('xmldom').DOMParser
// HTTP headers sent with the POST request, including a hard-coded session cookie.
const headers = {
  'Connection': 'keep-alive',
  'Accept': '*/*',
  'X-Requested-With': 'XMLHttpRequest',
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
  'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
  'Origin': 'http://zjcs.yn.gov.cn',
  'Referer': 'http://zjcs.yn.gov.cn/yns/cxz/bidResultNotice',
  'Accept-Language': 'zh-CN,zh;q=0.9',
  'Cookie': 'JSESSIONID=0D9B46EC25EAEBD93FCDC314EF91A532; _gscu_802487706=54155365ki5qvy63; Hm_lvt_b9099e95d08017e30f6285a8b55eb822=1654155365; security_session_verify=197043d259f590973cb82fe37b0a88dd',
};

// Pre-encoded form body (matches the urlencoded Content-Type above); asks for pageNumber=0.
const dataString = 'listVo.projectName=&listVo.serviceType=&listVo.divisionCode=532300&purOrgCodePanel_selectname=&selectBox_purOrgCodePanel=&listVo.purOrgCode=&listVo.bidResultDateBegin=&listVo.bidResultDateEnd=&pageNumber=0&sourtType=&';

// Options object handed to request(): target URL, method, headers and body.
const options = {
  url: 'http://zjcs.yn.gov.cn/yns/cxz/bidResultNotice/rest',
  method: 'POST',
  headers,
  body: dataString,
};

// Start time, captured immediately before the request is issued.
const date1 = new Date();

// Send the request; `callback` receives (error, response, body).
request(options, callback);
/**
 * Strip whitespace from a string.
 *
 * Leading and trailing whitespace is always removed. When `is_global` is the
 * string "g" (case-insensitive), every whitespace character remaining inside
 * the string is removed as well.
 *
 * @param {string} str - Text to clean up.
 * @param {string} [is_global] - Pass "g" or "G" to also drop inner whitespace;
 *   when omitted (or any other value) only the ends are trimmed.
 * @returns {string} The cleaned string.
 */
function Trim(str, is_global) {
  const trimmed = str.trim();
  // The flag is optional: check its type first so a missing or non-string
  // flag falls back to an ends-only trim instead of throwing on toLowerCase().
  const stripAll = typeof is_global === "string" && is_global.toLowerCase() === "g";
  return stripAll ? trimmed.replace(/\s/g, "") : trimmed;
}
function callback(error, response, body) {
var html_data;
var data_list=[];
if (!error && response.statusCode == 200) {
html_data=body
}
else{
html_data=""
}
let xml = html_data;
let doc = new dom().parseFromString(xml);
var base_url= 'http://zjcs.yn.gov.cn'
var tr_list=xpath.select('//table[@class="table table-hover table-bordered table-list"]//tr',doc);
for (var tr=1;tr
第二名:GO ==> 1.6s
GO语言参考代码:
package main
import (
"fmt"
htmlquery "github.com/antchfx/xquery/html"
"io/ioutil"
"log"
"net/http"
"regexp"
"strconv"
"strings"
"time"
)
// main prints the total page count reported by get_total_page, then POSTs the
// search form for each page index in the loop (currently only index 0), hands
// the response body to parse, and prints the parsed rows together with the
// time elapsed since start.
func main() {
	start := time.Now()
	client := &http.Client{}

	totalPage := get_total_page()
	fmt.Println("总页码:", totalPage)

	// The loop bound is fixed at 1, so only the first page is requested even
	// though the total page count is known.
	for i := 0; i < 1; i++ {
		fmt.Println("第" + strconv.Itoa(i+1) + "次请求")

		// Pre-encoded form body; only pageNumber varies between iterations.
		form := `listVo.projectName=&listVo.serviceType=&listVo.divisionCode=532300&purOrgCodePanel_selectname=&selectBox_purOrgCodePanel=&listVo.purOrgCode=&listVo.bidResultDateBegin=&listVo.bidResultDateEnd=&pageNumber=` + strconv.Itoa(i) + `&sourtType=&`
		req, err := http.NewRequest("POST", "http://zjcs.yn.gov.cn/yns/cxz/bidResultNotice/rest", strings.NewReader(form))
		if err != nil {
			log.Fatal(err)
		}
		req.Header.Set("X-Requested-With", "XMLHttpRequest")
		req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36")
		req.Header.Set("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8")
		req.Header.Set("Origin", "http://zjcs.yn.gov.cn")
		req.Header.Set("Referer", "http://zjcs.yn.gov.cn/yns/cxz/bidResultNotice")
		req.Header.Set("Accept-Language", "zh-CN,zh;q=0.9")

		resp, err := client.Do(req)
		if err != nil {
			log.Fatal(err)
		}
		bodyText, err := ioutil.ReadAll(resp.Body)
		// Close right away instead of deferring: a defer inside the loop would
		// keep every response body open until main returns.
		resp.Body.Close()
		if err != nil {
			log.Fatal(err)
		}

		// Parse the returned HTML into rows.
		dataList := parse(string(bodyText))
		fmt.Println(dataList)
		fmt.Println("总耗时:", time.Since(start))
	}
}
func get_total_page() int {
client := &http.Client{}
var data = strings.NewReader(`listVo.projectName=&listVo.serviceType=&listVo.divisionCode=532300&purOrgCodePanel_selectname=&selectBox_purOrgCodePanel=&listVo.purOrgCode=&listVo.bidResultDateBegin=&listVo.bidResultDateEnd=&pageNumber=1&sourtType=&`)
req, err := http.NewRequest("POST", "http://zjcs.yn.gov.cn/yns/cxz/bidResultNotice/rest", data)
if err != nil {
log.Fatal(err)
}
req.Header.Set("Connection", "keep-alive")
req.Header.Set("Accept", "*/*")
req.Header.Set("X-Requested-With", "XMLHttpRequest")
req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36")
req.Header.Set("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8")
req.Header.Set("Origin", "http://zjcs.yn.gov.cn")
req.Header.Set("Referer", "http://zjcs.yn.gov.cn/yns/cxz/bidResultNotice")
req.Header.Set("Accept-Language", "zh-CN,zh;q=0.9")
req.Header.Set("Cookie", "JSESSIONID=F1D78281958EE56DEDBC98B5AEEB5E6E; _gscu_802487706=54155365ki5qvy63; Hm_lvt_b9099e95d08017e30f6285a8b55eb822=1654155365; security_session_verify=4e95296e56b6f870f316ab54bf33e137")
resp, err := client.Do(req)
if err != nil {
log.Fatal(err)
}
defer resp.Body.Close()
bodyText, err := ioutil.ReadAll(resp.Body)
if err != nil {
log.Fatal(err)
}
//fmt.Printf("%s\n", bodyText)
total_pages, _ := regexp.Compile(`'resultPannel','(\d+)'`) // 定义匹配规则
total_page_List := total_pages.FindAllString(string(bodyText),-1) // 正则匹配
//fmt.Println(total_page_List)
rule_1,_:=regexp.Compile(`(\d+)`) // 定义匹配规则
//for a:=0;a
第三名:Python ==> 1.7s
Python参考代码:
# -*- coding:utf-8 -*-
import requests
from lxml import etree
import time
# Timestamp taken before the request so the total run time can be printed at the end.
start_time = time.time()

# HTTP headers sent with the POST request.
headers = {
    "Connection": "keep-alive",
    "Pragma": "no-cache",
    "Cache-Control": "no-cache",
    "Accept": "*/*",
    "X-Requested-With": "XMLHttpRequest",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Origin": "http://220.163.118.100",
    "Referer": "http://220.163.118.100/yns/bidResultNotice",
    "Accept-Language": "zh-CN,zh;q=0.9"
}
# Hard-coded session cookie.
cookies = {
    "JSESSIONID": "CF798AC4AC25E5BD622A2898B9C94B89"
}
url = "http://220.163.118.100/yns/bidResultNotice/rest"
# Form fields of the search request; pageNumber "0" asks for the first page.
data = {
    "listVo.projectName": "",
    "listVo.serviceType": "",
    "listVo.divisionCode": "530000",
    "purOrgCodePanel_selectname": "",
    "selectBox_purOrgCodePanel": "",
    "listVo.purOrgCode": "",
    "listVo.bidResultDateBegin": "",
    "listVo.bidResultDateEnd": "",
    "pageNumber": "0",
    "sourtType": "",
    "": ""
}

# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.post(url, headers=headers, cookies=cookies, data=data, verify=False, timeout=10)
html = etree.HTML(response.text)

# The first <tr> of the result table is skipped (presumably the header row).
tr_list = html.xpath('//table[@class="table table-hover table-bordered table-list"]/tr')[1:]
data_list = []
for tr in tr_list:
    hrefs = tr.xpath('./td[2]/a/@href')
    bid_times = tr.xpath('./td[6]/text()')
    if not hrefs or not bid_times:
        # Rows lacking the link or the date cell cannot produce a record;
        # skip them instead of failing with IndexError.
        continue
    item = {
        "title": "".join(tr.xpath('./td[2]/a/text()')).strip(),
        "href": 'http://220.163.118.100' + hrefs[0].strip(),
        "bid_time": bid_times[0].strip()
    }
    data_list.append(item)

print(data_list)
end_time = time.time()
print("总耗时:", end_time - start_time)
代码一看就很清楚了:把代码行数和总耗时一对比,基本上就能看出哪个效率高、哪个效率低,哈哈。
作者闲的,技术交流可以加 企鹅:2695733665