VB网抓汉字乱码问题的解决

最近在尝试学习网抓,遇到了待汉字的链接出错的问题。通过浏览器直接输入带汉字的链接是可以解析出来并返回正确页面的,而通过程序却总是返回找不到页面的错误。因而怀疑是汉字编码转码的问题。通过shell命令使用chrome可以打开,但shell IE浏览器却无法打开。使用XMLHTTP、internetexplorer、Webbrowser控件均是乱码。可以基本确定是发送的汉字没有正确转码的问题了。VB发送的字符串是UTF-8,而页面接受的最终数据是GB2312。附上转化代码。

Function gb2utf(szInput)
Dim wch, uch, szRet
Dim x
Dim nAsc, nAsc2, nAsc3

If szInput = "" Then
    unic = szInput
    Exit Function
End If
For x = 1 To Len(szInput)
    wch = Mid(szInput, x, 1)
    nAsc = AscW(wch)
    If nAsc < 0 Then nAsc = nAsc + 65536

    If (nAsc And &HFF80) = 0 Then
        szRet = szRet & wch
    Else
        If (nAsc And &HF000) = 0 Then
            uch = "%" & Hex(((nAsc \ 2 ^ 6)) Or &HC0) & Hex(nAsc And &H3F Or &H80)
            szRet = szRet & uch
        Else
            uch = "%" & Hex((nAsc \ 2 ^ 12) Or &HE0) & "%" & _
                        Hex((nAsc \ 2 ^ 6) And &H3F Or &H80) & "%" & _
                        Hex(nAsc And &H3F Or &H80)
            szRet = szRet & uch
        End If
    End If
Next
    
gb2utf = szRet
End Function


Function utf2gb(UTFStr)
For Dig = 1 To Len(UTFStr)
If Mid(UTFStr, Dig, 1) = "%" Then
If Len(UTFStr) >= Dig + 8 Then
GBStr = GBStr & ConvChinese(Mid(UTFStr, Dig, 9))
Dig = Dig + 8
Else
GBStr = GBStr & Mid(UTFStr, Dig, 1)
End If
Else
GBStr = GBStr & Mid(UTFStr, Dig, 1)
End If
Next
utf2gb = GBStr
End Function

Function ConvChinese(x)
A = Split(Mid(x, 2), "%")
i = 0
j = 0

For i = 0 To UBound(A)
A(i) = c16to2(A(i))
Next

For i = 0 To UBound(A) - 1
DigS = InStr(A(i), "0")
Unicode = ""
For j = 1 To DigS - 1
If j = 1 Then
A(i) = Right(A(i), Len(A(i)) - DigS)
Unicode = Unicode & A(i)
Else
i = i + 1
A(i) = Right(A(i), Len(A(i)) - 2)
Unicode = Unicode & A(i)
End If
Next

If Len(c2to16(Unicode)) = 4 Then
ConvChinese = ConvChinese & ChrW(Int("&H" & c2to16(Unicode)))
Else
ConvChinese = ConvChinese & Chr(Int("&H" & c2to16(Unicode)))
End If
Next
End Function

Function c2to16(x)
i = 1
For i = 1 To Len(x) Step 4
c2to16 = c2to16 & Hex(c2to10(Mid(x, i, 4)))
Next
End Function

Function c2to10(x)
c2to10 = 0
If x = "0" Then Exit Function
i = 0
For i = 0 To Len(x) - 1
If Mid(x, Len(x) - i, 1) = "1" Then c2to10 = c2to10 + 2 ^ (i)
Next
End Function

Function c16to2(x)
i = 0
For i = 1 To Len(Trim(x))
tempstr = c10to2(CInt(Int("&h" & Mid(x, i, 1))))
Do While Len(tempstr) < 4
tempstr = "0" & tempstr
Loop
c16to2 = c16to2 & tempstr
Next
End Function

Function c10to2(x)
mysign = Sgn(x)
x = Abs(x)
DigS = 1
Do
If x < 2 ^ DigS Then
Exit Do
Else
DigS = DigS + 1
End If
Loop
tempnum = x

i = 0
For i = DigS To 1 Step -1
If tempnum >= 2 ^ (i - 1) Then
tempnum = tempnum - 2 ^ (i - 1)
c10to2 = c10to2 & "1"
Else
c10to2 = c10to2 & "0"
End If
Next
If mysign = -1 Then c10to2 = "-" & c10to2
End Function

然后解决页面返回汉字乱码的问题。通过简单的strResponse = StrConv(xmlHttp.responseBody, vbUnicode)无法正确解码。于是只能手动转了,附上代码。页面是GB2312需要转换成UTF-8。

Private Sub testCode()
'测试
Debug.Print GetHtml("http://www.baidu.com/")
End Sub

Public Function GetHtml(url As String)
Dim xmlHttp As Object
Set xmlHttp = CreateObject("Microsoft.XMLHTTP")'
xmlHttp.Open "GET", url, True
'    xmlHttp.Open "POST", url, True
'    xmlHttp.setRequestHeader "Content-Type", "application/x-www-form-urlencoded;charset=utf-8"
xmlHttp.send (Null)
While xmlHttp.ReadyState <> 4
    DoEvents
Wend
GetHtml = BytesToBstr(xmlHttp.responseBody)
End Function

Public Function PostHtml(url As String, val As String)
Dim xmlHttp As Object
Set xmlHttp = CreateObject("Microsoft.XMLHTTP")
'    xmlHttp.Open "GET", url, True
xmlHttp.Open "POST", url, True
xmlHttp.setRequestHeader "Content-Type", "application/x-www-form-urlencoded"
xmlHttp.send (val)
While xmlHttp.ReadyState <> 4
    DoEvents
Wend
PostHtml = BytesToBstr(xmlHttp.responseBody)
End Function


Public Function BytesToBstr(Bytes)
Dim Unicode As String
If IsUTF8(Bytes) Then '如果不是UTF-8编码则按照GB2312来处理
    Unicode = "UTF-8"
Else
    Unicode = "GB2312"
End If

Dim objstream As Object
Set objstream = CreateObject("ADODB.Stream")
With objstream
    .Type = 1
    .Mode = 3
    .Open
    .Write Bytes
    .Position = 0
    .Type = 2
    .Charset = Unicode
    BytesToBstr = .ReadText
   .Close
End With
End Function

 '判断网页编码函数
Private Function IsUTF8(Bytes) As Boolean
    Dim i As Long, AscN As Long, Length As Long
    Length = UBound(Bytes) + 1
   
    If Length < 3 Then
        IsUTF8 = False
        Exit Function
    ElseIf Bytes(0) = &HEF And Bytes(1) = &HBB And Bytes(2) = &HBF Then
        IsUTF8 = True
        Exit Function
    End If

    Do While i <= Length - 1
        If Bytes(i) < 128 Then
            i = i + 1
            AscN = AscN + 1
        ElseIf (Bytes(i) And &HE0) = &HC0 And (Bytes(i + 1) And &HC0) = &H80 Then
            i = i + 2

        ElseIf i + 2 < Length Then
            If (Bytes(i) And &HF0) = &HE0 And (Bytes(i + 1) And &HC0) = &H80 And (Bytes(i + 2) And &HC0) = &H80 Then
                 i = i + 3
            Else
                IsUTF8 = False
                Exit Function
            End If
        Else
            IsUTF8 = False
            Exit Function
        End If
    Loop
           
    If AscN = Length Then
        IsUTF8 = False
    Else
        IsUTF8 = True
    End If
End Function

你可能感兴趣的:(VB网抓汉字乱码问题的解决)