网络爬虫是当前编程领域的一个热点,图片、视频、小说……无所不能爬,有些公司甚至将网络爬虫当作一种商业手段来获取竞争对手的某些资料。对于喜欢二次元的程序员,写一个爬虫爬取自己喜爱的图片无疑是一件快乐的事情。
虽然当下用Python写的爬虫盛行于整个互联网,但是出于对vbs这门被微软抛弃的语言仍然执着的喜爱,我还是用vbs做了一个简单的图片爬虫。
咳咳,话有点多了,权当是发发牢骚,不要介意。
言归正传,在这里简单地对我的代码做个阐述。简单来说,爬虫的实现方式就是: 发送请求到目标网站===>获取页面源代码并从中提取存放图片的子页面的链接===>发送请求到所有子页面的链接(简称重定向)===>获取页面源代码并从中提取图片的链接===>最后下载链接对应的图片到指定目录。
开始之前我们先做一些准备
为了增强脚本的可视性(简称装13),我打算在命令行里运行我的vbs脚本,也就是说用"CScript.exe"当作我的宿主解释器。
首先新建一个文本文档,后缀名改成".cmd"或者".bat",粘贴下面的代码。
@echo off
rem Launcher: drag a .vbs file onto this file to run it under CScript in a console window.
color 0a
rem Without a script path there is nothing to run, so explain the usage instead.
if "%~1"=="" (
    echo Usage: drag a .vbs file onto this file, or run: %~nx0 script.vbs
    pause
    exit /b 1
)
rem "%~1" strips any surrounding quotes and re-quotes the path, so paths with spaces work.
CScript.exe "%~1"
rem Keep the window open so the output can be read. The original trailing
rem "end echo" line was removed because it is not a valid command.
pause
然后保存(记得把编码格式设置成ANSI)。运行的时候,只要把vbs文件拖放到这个bat文件的图标上即可。
(如果感觉刚刚的操作有些复杂,可以跳过这一步,直接右键点击vbs文件并选择“在命令提示符中打开(w)”也可以)
'Objects shared by every routine of the script.

'Console input stream: reads what is typed on the command line.
Set StdIn = WScript.StdIn
'Console output stream: writes text to the command line.
Set StdOut = WScript.StdOut
'HTTP client: sends the requests and exposes the returned page source.
Set http = CreateObject("Msxml2.ServerXMLHTTP")
'Regular-expression object used to search the page source.
Set reg = New RegExp
'Input function built on the console input stream.
'Reads one line from StdIn and returns it to the caller.
Function Input()
    Dim typedLine
    typedLine = StdIn.ReadLine
    Input = typedLine
End Function
'Output function built on the console output stream.
'
'Parameters:
'  string - the text to write to StdOut.
'  bl     - True writes the text without a trailing line break (StdOut.Write);
'           False writes the text followed by a line break (StdOut.WriteLine).
'           Any other value writes nothing.
Function Print(string, bl)
    Select Case bl
        Case True
            Print = StdOut.Write(string)
        Case False
            Print = StdOut.WriteLine(string)
    End Select
End Function
'Step 1 --- collect the links to the picture pages listed on list page i.
'
'Parameters:
'  i - number of the list page; the request URL is url & CStr(i) & behind.
'Returns:
'  The array of page links collected so far, or an empty array (UBound = -1)
'  when none has been collected.
'Uses the script-level http, reg, url, behind, links and index. index is not
'reset here, so links found by successive calls are appended to the same array.
Function getPageLinks(i)
    Dim pageurl
    pageurl = url & CStr(i) & behind
    Print "From page" & i & " :Start redirecting links :: " & pageurl, False
    Print "", False
    'Third argument False = synchronous: send returns only once the response is in.
    http.open "GET", pageurl, False
    http.send
    If http.Status = 200 Then
        'NOTE(review): the published pattern and the first Replace argument lost
        'their HTML-tag text, which left an unterminated string literal. This
        'version captures the quoted absolute URL of every href ending in .html;
        'confirm it against the markup of the live list page.
        reg.Pattern = "href=""(https?://[^""]*\.html)"""
        Set matches = reg.Execute(Replace(http.responseText, vbCrLf, vbLf))
        For Each match In matches
            ReDim Preserve links(index)
            'SubMatches(0) is the captured URL, without the href= prefix and the quotes.
            links(index) = match.SubMatches(0)
            Print "Get the page link: " & links(index) & " :)", False
            index = index + 1
        Next
        Set matches = Nothing
    Else
        Print "Request failed with HTTP status " & http.Status & " :(", False
    End If
    If index = 0 Then
        'Nothing collected: return an empty array so that UBound() on the result is -1.
        getPageLinks = Array()
    Else
        getPageLinks = links
    End If
    Print "", False
    'index is incremented after every stored link, so it already equals the count.
    Print "Get " & index & " pagelinks at all :)", False
    Print "", False
    Print "", False
End Function
'Step 2 --- request every picture page and collect the matching picture links.
'
'Parameters:
'  links - array of page URLs, as returned by getPageLinks.
'Returns:
'  An array with the picture URLs found on those pages, or an empty array
'  (UBound = -1) when no picture was found.
'Uses the script-level http, reg, picurl and index; index is reset to 0 here
'and ends up holding the number of picture links found.
Function getPicLinks(links)
    Dim piclink(), j
    'NOTE(review): the published pattern and the first Replace argument lost
    'their HTML-tag text, which left an unterminated string literal. This
    'version captures the quoted src value that starts with picurl and ends in
    '.jpg; confirm it against the markup of a live picture page.
    reg.Pattern = "src=""(" & Replace(picurl, ".", "\.") & "[^""]+\.jpg)"""
    index = 0
    For j = 0 To UBound(links)
        'Third argument False = synchronous: send returns only once the response is in.
        http.open "GET", links(j), False
        http.send
        If http.Status = 200 Then
            Set matches = reg.Execute(Replace(http.responseText, vbCrLf, vbLf))
            For Each match In matches
                ReDim Preserve piclink(index)
                'SubMatches(0) is the captured URL, without the src= prefix and the quotes.
                piclink(index) = match.SubMatches(0)
                Print "Get picture link: " & piclink(index) & " :)", False
                index = index + 1
            Next
            Set matches = Nothing
        Else
            Print "Request failed with HTTP status " & http.Status & ": " & links(j) & " :(", False
        End If
    Next
    If index = 0 Then
        'Nothing found: return an empty array so that UBound() on the result is -1.
        getPicLinks = Array()
    Else
        getPicLinks = piclink
    End If
    Print "", False
    'index is incremented after every stored link, so it already equals the count.
    Print "Get " & index & " pictures at all. :)", False
    Print "", False
    Print "", False
End Function
原理是调用Adodb.Stream对象将请求返回的二进制图片数据写入本地文件,保存为图片的格式。
'Download the pictures whose links were collected.
'
'Parameters:
'  piclink - array of picture URLs, as returned by getPicLinks.
'Returns:
'  The number of pictures that were saved.
'Uses the script-level http object and the Print routine.
Function Download(piclink)
    'Each file is written as SAVE_PREFIX & <number> & ".jpg"; SaveToFile does not
    'create folders, so the target folder has to exist already.
    Const SAVE_PREFIX = "D:\a编程学习\抽象编程工具\project\vbs文件\vbs进阶\爬虫测试--vbs\moe.005.tv\test\pic"
    Dim pos, fileNo, saved, stream
    saved = 0
    'A non-array argument (e.g. Empty) would make UBound fail; treat it as no links.
    If Not IsArray(piclink) Then
        Download = 0
        Exit Function
    End If
    For pos = 0 To UBound(piclink)
        'Files are numbered from 1 while the array is indexed from 0.
        fileNo = pos + 1
        'Third argument False = synchronous: send returns only once the response is in.
        http.open "GET", piclink(pos), False
        http.send
        If http.Status = 200 Then
            Set stream = CreateObject("Adodb.Stream")
            With stream
                .Type = 1 'adTypeBinary: the response body is raw bytes
                .Open
                .Write http.responseBody
                .SaveToFile SAVE_PREFIX & fileNo & ".jpg", 2 'adSaveCreateOverWrite
                .Close
            End With
            Set stream = Nothing
            saved = saved + 1
            'Report the same number that was used in the file name.
            Print "Successfully download “pic" & fileNo & "”.jpg :)", False
        Else
            'Do not store an error page under a .jpg name.
            Print "Skip " & piclink(pos) & " (HTTP status " & http.Status & ") :(", False
        End If
    Next
    Download = saved
End Function
(原本为了锻炼英语把注释都写成了英文。为了增强可读性,大部分注释我都进行了翻译。)
'Picture spider --- main script.
'source website: "http://moe.005.tv/moeimg/tb/"
'another source website(example webpage): "http://moe.005.tv/moeimg/bz/"
'Version = 1.0
'Remake

'WScript.StdIn and WScript.StdOut only work under CScript.exe, so stop early
'with a clear message when the script is started with another host.
If InStr(1, WScript.FullName, "cscript.exe", vbTextCompare) = 0 Then
    MsgBox "Please run this script with CScript.exe, e.g.: CScript.exe spider.vbs", 48, "Pictures-Spider"
    WScript.Quit 1
End If

'Encoding self-test: if the text below shows up garbled, the file has to be
'saved with a different encoding before it is run.
WScript.Echo("_~测试-编码~_")
MsgBox " 测试编码,按下确定开始运行 " & vbCrLf & " 如果是乱码需要修改编码格式 ", 0, "Pictures-Spider"

'Define variables and constants
Dim http, reg, ado, matches, StdIn, StdOut, links(), match, i, j, index, startT, overT
Const url = "http://moe.005.tv/moeimg/bz/list_4_"
Const behind = ".html"
Const picurl = "http://www.005.tv/uploads/allimg/"

'Initialize variables
index = 0

'Decorative banner; it has no functional purpose.
WScript.Echo("")
WScript.Echo("|===========================================================================|")
WScript.Echo("|We strongly suggest that you should use CScript.exe to run this script. |")
WScript.Echo("|F11食用体验更佳哦~ |")
WScript.Echo("|Spider version = 1.0 |")
WScript.Echo("|===========================================================================|")
WScript.Echo("")
'End of the banner.

'Main()
'Create objects
Set http = CreateObject("Msxml2.ServerXMLHTTP")
Set StdIn = WScript.StdIn
Set StdOut = WScript.StdOut
Set reg = New RegExp

'Set the properties of the reg object
reg.Multiline = True
reg.IgnoreCase = False
'Global = True makes Execute return every match instead of only the first one.
reg.Global = True

Print "IPress <Enter> to start running(按下<Enter>键开始爬取图片): ", True
Input()
Print "", False

'Keep the full date and time: a time-of-day value alone yields a negative
'duration when the run crosses midnight.
startT = Now

'Pipeline: list page 1 -> picture-page links -> picture links -> download.
index = Download(getPicLinks(getPageLinks(1)))
WScript.Echo("")
Print "Download " & index & " pictures at all! :)", False

'Release the objects
Set StdIn = Nothing
Set StdOut = Nothing
Set http = Nothing
Set reg = Nothing

overT = CStr(DateDiff("s", startT, Now))
WScript.Echo("Time: (总计用时<取整>) " & overT & " Seconds")
MsgBox "Finish!"
'The original ran "taskkill -f -im cmd.exe" here. That force-kills every
'cmd.exe process, not only this script's console, so it was removed; close
'the console window manually instead.
WScript.Quit
'Step 1 --- collect the links to the picture pages listed on list page i.
'
'Parameters:
'  i - number of the list page; the request URL is url & CStr(i) & behind.
'Returns:
'  The array of page links collected so far, or an empty array (UBound = -1)
'  when none has been collected.
'Uses the script-level http, reg, url, behind, links and index. index is not
'reset here, so links found by successive calls are appended to the same array.
Function getPageLinks(i)
    Dim pageurl
    pageurl = url & CStr(i) & behind
    Print "From page" & i & " :Start redirecting links :: " & pageurl, False
    Print "", False
    'Third argument False = synchronous: send returns only once the response is in.
    http.open "GET", pageurl, False
    http.send
    If http.Status = 200 Then
        'NOTE(review): the published pattern and the first Replace argument lost
        'their HTML-tag text, which left an unterminated string literal. This
        'version captures the quoted absolute URL of every href ending in .html;
        'confirm it against the markup of the live list page.
        reg.Pattern = "href=""(https?://[^""]*\.html)"""
        Set matches = reg.Execute(Replace(http.responseText, vbCrLf, vbLf))
        For Each match In matches
            ReDim Preserve links(index)
            'SubMatches(0) is the captured URL, without the href= prefix and the quotes.
            links(index) = match.SubMatches(0)
            Print "Get the page link: " & links(index) & " :)", False
            index = index + 1
        Next
        Set matches = Nothing
    Else
        Print "Request failed with HTTP status " & http.Status & " :(", False
    End If
    If index = 0 Then
        'Nothing collected: return an empty array so that UBound() on the result is -1.
        getPageLinks = Array()
    Else
        getPageLinks = links
    End If
    Print "", False
    'index is incremented after every stored link, so it already equals the count.
    Print "Get " & index & " pagelinks at all :)", False
    Print "", False
    Print "", False
End Function
'Step 2 --- request every picture page and collect the matching picture links.
'
'Parameters:
'  links - array of page URLs, as returned by getPageLinks.
'Returns:
'  An array with the picture URLs found on those pages, or an empty array
'  (UBound = -1) when no picture was found.
'Uses the script-level http, reg, picurl and index; index is reset to 0 here
'and ends up holding the number of picture links found.
Function getPicLinks(links)
    Dim piclink(), j
    'NOTE(review): the published pattern and the first Replace argument lost
    'their HTML-tag text, which left an unterminated string literal. This
    'version captures the quoted src value that starts with picurl and ends in
    '.jpg; confirm it against the markup of a live picture page.
    reg.Pattern = "src=""(" & Replace(picurl, ".", "\.") & "[^""]+\.jpg)"""
    index = 0
    For j = 0 To UBound(links)
        'Third argument False = synchronous: send returns only once the response is in.
        http.open "GET", links(j), False
        http.send
        If http.Status = 200 Then
            Set matches = reg.Execute(Replace(http.responseText, vbCrLf, vbLf))
            For Each match In matches
                ReDim Preserve piclink(index)
                'SubMatches(0) is the captured URL, without the src= prefix and the quotes.
                piclink(index) = match.SubMatches(0)
                Print "Get picture link: " & piclink(index) & " :)", False
                index = index + 1
            Next
            Set matches = Nothing
        Else
            Print "Request failed with HTTP status " & http.Status & ": " & links(j) & " :(", False
        End If
    Next
    If index = 0 Then
        'Nothing found: return an empty array so that UBound() on the result is -1.
        getPicLinks = Array()
    Else
        getPicLinks = piclink
    End If
    Print "", False
    'index is incremented after every stored link, so it already equals the count.
    Print "Get " & index & " pictures at all. :)", False
    Print "", False
    Print "", False
End Function
'Download the pictures whose links were collected.
'
'Parameters:
'  piclink - array of picture URLs, as returned by getPicLinks.
'Returns:
'  The number of pictures that were saved.
'Uses the script-level http object and the Print routine.
Function Download(piclink)
    'Each file is written as SAVE_PREFIX & <number> & ".jpg"; SaveToFile does not
    'create folders, so the target folder has to exist already.
    Const SAVE_PREFIX = "D:\a编程学习\抽象编程工具\project\vbs文件\vbs进阶\爬虫测试--vbs\moe.005.tv\test\pic"
    Dim pos, fileNo, saved, stream
    saved = 0
    'A non-array argument (e.g. Empty) would make UBound fail; treat it as no links.
    If Not IsArray(piclink) Then
        Download = 0
        Exit Function
    End If
    For pos = 0 To UBound(piclink)
        'Files are numbered from 1 while the array is indexed from 0.
        fileNo = pos + 1
        'Third argument False = synchronous: send returns only once the response is in.
        http.open "GET", piclink(pos), False
        http.send
        If http.Status = 200 Then
            Set stream = CreateObject("Adodb.Stream")
            With stream
                .Type = 1 'adTypeBinary: the response body is raw bytes
                .Open
                .Write http.responseBody
                .SaveToFile SAVE_PREFIX & fileNo & ".jpg", 2 'adSaveCreateOverWrite
                .Close
            End With
            Set stream = Nothing
            saved = saved + 1
            'Report the same number that was used in the file name.
            Print "Successfully download “pic" & fileNo & "”.jpg :)", False
        Else
            'Do not store an error page under a .jpg name.
            Print "Skip " & piclink(pos) & " (HTTP status " & http.Status & ") :(", False
        End If
    Next
    Download = saved
End Function
'Input function built on the console input stream.
'Reads one line from StdIn and returns it to the caller.
Function Input()
    Dim typedLine
    typedLine = StdIn.ReadLine
    Input = typedLine
End Function
'Output function built on the console output stream.
'
'Parameters:
'  string - the text to write to StdOut.
'  bl     - True writes the text without a trailing line break (StdOut.Write);
'           False writes the text followed by a line break (StdOut.WriteLine).
'           Any other value writes nothing.
Function Print(string, bl)
    Select Case bl
        Case True
            Print = StdOut.Write(string)
        Case False
            Print = StdOut.WriteLine(string)
    End Select
End Function
'How to run from a command prompt: first cd into the folder that contains the script,
'then execute: CScript.exe spider.vbs
vbs爬虫的速度确实比python爬虫速度快很多,我有在B站上面投稿过一个视频,有兴趣的话你可以看一看。
论vbs和python的爬虫哪个更厉害
我写这篇博客的时候其实是抱着复活vbs的念头的(虽然几乎不可能实现了),即使Javascript的出现已经将vbs踢出了程序员的视线,微软也放弃了对vbs的支持,但是不管怎样我都希望这个脚本语言不要淡出大家的记忆。vbs其实还有很大的潜力来辅助日常的工作,我不希望它只会沦落到依靠用循环弹窗表白才能吸引眼球。
如果你看到这里,那么谢谢你能将这篇博文看完,希望能对你有所帮助,如果有错误的地方谢谢你能指出来。
祝:身体健康,工作顺利。谢谢你,再见!