作者:iamlaosong
我前一阵子用VBA做了个工具,用XMLHTTP的Get功能抓取城市间距离。现在我想用用XMLHTTP的Post功能抓取邮件轨迹。抓取数据是用Get还是Post,取决于网站提交参数的方法。
1、通过分析(用fiddler),邮件轨迹查询网站是用post提交参数的。如下图:
上图中“Entity”内容用于设置包头,点击“TextView”可以看到传输的参数内容,邮件号码,如下图:
抓取数据的代码如下:
Sub tt()
Dim HttpReq As Object
Dim pdata, http As String
Set HttpReq = CreateObject("MSXML2.XMLHTTP.6.0")
'轨迹头部数据,网址用xxx屏蔽
http = "http://10.xxx.xxx.xxx/querypush-traln/qps/qpswaybilltraceinternal/queryTraceByTrace_no/"
pdata = "trace_no=1044905987232"
HttpReq.Open "Post", http, False
HttpReq.setRequestHeader "Content-Length", Len(pdata)
'HttpReq.setRequestHeader "CONTENT-TYPE", "application/x-www-form-urlencoded"
HttpReq.setRequestHeader "CONTENT-TYPE", "application/x-www-form-urlencoded; charset=utf-8"
HttpReq.send pdata '"trace_nos=1194359346482"
Do Until HttpReq.readyState = 4
DoEvents
Loop
If HttpReq.Status = 200 Then
Debug.Print HttpReq.responseText
End If
End Sub
2、返回内容是个json结构的数据,可以用fiddler的查看返回内容,点击“TextView”:
点击“JSON”可以看到数据解析结果:
但vba解析的时候内容为空,对比以前返回json结构的数据,发现这个数据少了个名称,因为定义JS函数时,指定了一个json结构数据的名称,即jscode = "function json(s,i) { return eval('(' + s + ').traces[' + i + ']'); }"中的traces名称。
加上后就可以解析了,即:
'返回数据要成为标准的json结构,还需要在外面加一层数据名称
buf = "{""traces"":" & HttpReq.responseText & "}"
kk = get_trace(buf)
get_trace函数就是用来解析json数据的,代码如下:
Function get_trace(mystring As String) As Integer
Dim objJSx As Object, objJSy As Object
Dim m1, m2, n, j As Integer
Dim source, level, kind, sm As String
On Error Resume Next
Set objJSx = CreateObject("ScriptControl") '调用MSScriptControl.ScriptControl对象将提取的变量文本运算形成对象集合
objJSx.Language = "JavaScript" '测试发现JavaScript、javascript、JScript都可以表示JavaScript语言
'定义一个JS函数,通过计算表达式的方式引入JSON数据并解析
jscode = "function json(s,i) { return eval('(' + s + ').traces[' + i + ']'); }"
objJSx.AddCode jscode
TT = "否"
For n = 1 To 100
If objJSx.Run("json", mystring, n - 1) = "" Then Exit For
Set objJSy = objJSx.Run("json", mystring, n - 1)
For j = 1 To 11
TraceInfo(n, j) = ""
Next j
TraceInfo(n, 1) = objJSy.traceNo
TraceInfo(n, 2) = objJSy.opCode
TraceInfo(n, 3) = objJSy.opTime
TraceInfo(n, 4) = objJSy.opName
TraceInfo(n, 6) = objJSy.opOrgCode
TraceInfo(n, 7) = objJSy.opOrgSimpleName
TraceInfo(n, 8) = objJSy.operatorNo
TraceInfo(n, 9) = objJSy.operatorName
TraceInfo(n, 10) = objJSy.level
TraceInfo(n, 11) = objJSy.source
sm = objJSy.desc
'剔除数据中的HTML部分
Do While InStr(sm, "<") > 0
m1 = InStr(sm, "<")
m2 = InStr(sm, ">")
If m2 > 0 Then
If Mid(sm, m1, 3) = "/br" Then
sm = Left(sm, m1 - 1) & " " & Right(sm, Len(sm) - m2)
Else
sm = Left(sm, m1 - 1) & Right(sm, Len(sm) - m2)
End If
Else
Exit Do
End If
Loop
TraceInfo(n, 5) = sm
If objJSy.opCode = "704" Then TT = "是"
Next n
get_trace = n - 1
End Function
3、实际使用的代码
Public Sub get_data()
'根据工作表中的查询语句读取数据
Dim HttpReq As Object
Dim i, k, kk, lineno, row1 As Long
Dim Mail, pdata, tbhead As String, buf As String
Dim arr_head
lineno = [A65536].End(xlUp).Row '行数,也是邮件号码数量
Range("B2:B" & lineno).ClearContents
'lineno = ActiveSheet.UsedRange.Rows.Count
Set HttpReq = CreateObject("MSXML2.XMLHTTP.6.0")
'轨迹头部数据
'http = "http://10.xxx.xxx.xxx/querypush-traln/qps/qpswaybilltraceinternal/queryCurrentTraceByTrace_nos/"
'pdata = "trace_nos="
'轨迹数据
http = "http://10.xxx.xxx.xxx/querypush-traln/qps/qpswaybilltraceinternal/queryTraceByTrace_no/"
'pdata = "trace_no="
row1 = 2
maxrow = Sheets("查询结果").UsedRange.Rows.Count
If maxrow >= 1 Then
Sheets("查询结果").Range("A1:L" & maxrow).ClearContents
End If
tbhead = "邮件号码 操作码 操作时间 处理动作 详细说明 机构代码 机构名称 操作员代码 操作员姓名 级别 来源"
arr_head = Split(tbhead, " ") '下标从0开始
Sheets("查询结果").Cells(1, 1).Resize(1, UBound(arr_head) + 1) = arr_head
For i = 2 To lineno
Mail = Trim(Sheets("邮件号码").Cells(i, 1))
If Mail = "" Then Exit For
If Len(Mail) = 13 Then
HttpReq.Open "Post", http, False
pdata = "trace_no=" & Mail
HttpReq.setRequestHeader "Content-Length", Len(pdata)
HttpReq.setRequestHeader "CONTENT-TYPE", "application/x-www-form-urlencoded"
HttpReq.send pdata 'pdata = "trace_no=1194359346482"
Do Until HttpReq.readyState = 4
DoEvents
Loop
'MsgBox HttpReq.getAllResponseHeaders
'Debug.Print HttpReq.responseText
'返回数据要成为标准的json结构,还需要在外面加一层数据名称
buf = "{""traces"":" & HttpReq.responseText & "}"
kk = get_trace(buf)
Sheets("邮件号码").Cells(i, 2) = TT
If kk > 0 Then
For k = kk To 1 Step -1
If CInt(TraceInfo(k, 10)) <= Range("E1") Then
For j = 1 To 11
Sheets("查询结果").Cells(row1, j) = TraceInfo(k, j)
Next j
row1 = row1 + 1
End If
Next k
Else
Sheets("邮件号码").Cells(i, 2) = "Err"
Sheets("查询结果").Cells(row1, 1) = Mail
Sheets("查询结果").Cells(row1, 2) = "Err"
Sheets("查询结果").Cells(row1, 4) = HttpReq.responseText
row1 = row1 + 1
delay (9 * Rnd + 1) '出错了,说明你还是干快了,随机后延时1-10秒,看运气了。
End If
'If CInt(i / 10) * 10 = i Then
Application.StatusBar = "完成:" & Round(i * 100 / lineno, 2) & "%"
DoEvents
'End If
delay (Rnd + 0.25) '总部领导说了,接口是大家用的,你一个人不能用太多,此处延时0.5秒,降降速度。
Else
Sheets("邮件号码").Cells(i, 2) = "异常"
End If
Next i
Application.StatusBar = "就绪"
Sheets("查询结果").Activate
msg = MsgBox("邮件批量查询完毕,共查询" & i - 2 & "个邮件!", vbOKOnly, "AHEMS:iamlaosong")
End Sub