这个其实不能算是完全意义上的网络扒虫,只是对某个社交网络进行扒取,然后得到邻接矩阵,以及相应的头像等信息。
主要的步骤:
1,扒取信息
2,正则匹配
正则表达式主要参考了:http://deerchao.net/tutorials/regex/regex.htm
扒取信息用的是 WebClient,这个类相对于 HttpWebRequest/HttpWebResponse 的写法更简洁一些。
难点是克服网站的认证机制,用的是保存Cookies的方法。
扒虫部分代码:
using System;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
/// <summary>
/// Downloads authenticated pages and user avatar images with <see cref="WebClient"/>.
/// Authentication is done by replaying a fixed Cookie header on page requests.
/// </summary>
public class Crawler
{
    // NOTE(review): session cookie is hard-coded in source. It should be supplied
    // from configuration instead; kept byte-for-byte here so behavior is unchanged.
    private const string SessionCookies = "_r01_=; depovince=BJ; p=; ap=; t=; societyguester=55a777838c4286ab5f657382dbd25c736; id=; xnsid=";

    // Directory prefix that avatar files are written under.
    private const string ImageDirectory = @"D:/picture/";

    // First run of word characters in a user name; used as the image file name.
    // The previous pattern "//w*" only matched a literal "//", so indexing the
    // first match threw for every ordinary user name.
    private static readonly Regex WordRun = new Regex(@"\w+");

    /// <summary>
    /// Fetches a page, sending the stored session cookies, and decodes the body as UTF-8.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <returns>The response body decoded as UTF-8 text.</returns>
    /// <exception cref="WebException">The download failed.</exception>
    public static string GetCont(string url)
    {
        // WebClient is IDisposable; release it as soon as the download is done.
        using (WebClient client = new WebClient())
        {
            client.Headers.Add("Cookie", SessionCookies);
            byte[] rawBody = client.DownloadData(url);
            return Encoding.UTF8.GetString(rawBody);
        }
    }

    /// <summary>
    /// Downloads a small avatar image to the picture directory, naming the file after
    /// the first run of word characters in <paramref name="UserName"/> plus ".jpg".
    /// The download is best-effort: failures are reported on standard error and
    /// do not propagate to the caller.
    /// </summary>
    /// <param name="ImgUrl">Address of the image. Null or empty means nothing to download.</param>
    /// <param name="UserName">
    /// Display name used to derive the file name. If it is null, empty, or contains
    /// no word characters, no file name can be derived and the call does nothing.
    /// </param>
    public static void GetImag(string ImgUrl, string UserName)
    {
        // Callers may pass null slots from partially filled result arrays; skip them.
        if (string.IsNullOrEmpty(ImgUrl) || string.IsNullOrEmpty(UserName))
        {
            return;
        }

        Match nameMatch = WordRun.Match(UserName);
        if (!nameMatch.Success)
        {
            return;
        }

        string imageFilePath = ImageDirectory + nameMatch.Value + ".jpg";

        using (WebClient client = new WebClient())
        {
            try
            {
                client.DownloadFile(ImgUrl, imageFilePath);
            }
            catch (Exception ex)
            {
                // Deliberately best-effort (one bad avatar must not stop a crawl),
                // but the failure is reported instead of being silently discarded.
                Console.Error.WriteLine("GetImag: failed to download '" + ImgUrl + "' to '" + imageFilePath + "': " + ex.Message);
            }
        }
    }
}
//正则表达式部分
using System.Text;
using System.Text.RegularExpressions;
/// <summary>
/// Regex-based extractors that pull friend profile links, avatar image addresses
/// and user names out of a downloaded friend-list page.
/// Every extractor returns an array of exactly <c>MaxResults</c> slots; slots
/// beyond the number of matches are left <c>null</c>.
/// </summary>
public class MyRegex
{
    // Fixed length of every returned array (presumably the number of friends
    // shown per page - confirm against the site).
    private const int MaxResults = 24;

    // NOTE(review): the host "www..com" looks like it was redacted when this code
    // was published; it is kept as-is. Restore the real host before use.
    // Matches a profile link up to (not including) the closing quote before " title=".
    private static readonly Regex ProfileUrlPattern =
        new Regex(@"http://www..com/profile.do\?portal=\w*&id=\d+(?=""\stitle=)");

    // Captures the src value of <img stats="pf_friend" src="..." width="50" />.
    // Lazy ".*?" so several images on one line yield one match each instead of
    // a single match spanning from the first src to the last width attribute.
    private static readonly Regex AvatarUrlPattern =
        new Regex(@"(?<=stats=""pf_friend""\ssrc\="").*?(?=""\swidth=""50""\s/>)");

    // Captures the name between title="查看 and 的个人主页"> when followed by one
    // non-word character and the friend avatar <img> tag. Lazy for the same reason.
    private static readonly Regex UserNamePattern =
        new Regex(@"(?<=title=""查看).*?(?=的个人主页"">\W<img\sstats=""pf_friend"")");

    /// <summary>Extracts friend profile page addresses from the page.</summary>
    /// <param name="PageHtml">HTML text of the page; must not be null.</param>
    /// <returns>Array of length 24 holding the matched addresses in page order, null-padded.</returns>
    public static string[] GetAddr(string PageHtml)
    {
        return CollectMatches(ProfileUrlPattern, PageHtml);
    }

    /// <summary>Extracts friend avatar image addresses from the page.</summary>
    /// <param name="PageHtml">HTML text of the page; must not be null.</param>
    /// <returns>Array of length 24 holding the matched addresses in page order, null-padded.</returns>
    public static string[] GetImgAddr(string PageHtml)
    {
        return CollectMatches(AvatarUrlPattern, PageHtml);
    }

    /// <summary>Extracts friend display names from the page.</summary>
    /// <param name="PageHtml">HTML text of the page; must not be null.</param>
    /// <returns>Array of length 24 holding the matched names in page order, null-padded.</returns>
    public static string[] GetUsName(string PageHtml)
    {
        return CollectMatches(UserNamePattern, PageHtml);
    }

    // Copies at most MaxResults match values into a fixed-size array. Matches beyond
    // the array length are ignored; previously they caused IndexOutOfRangeException.
    private static string[] CollectMatches(Regex pattern, string input)
    {
        string[] results = new string[MaxResults];
        MatchCollection matches = pattern.Matches(input);
        for (int i = 0; i < matches.Count && i < MaxResults; i++)
        {
            results[i] = matches[i].Value;
        }
        return results;
    }
}