C# .NET 爬虫:抓取京东商城所有商品分类。引用 HtmlAgilityPack,用 XPath 解析;代码完整,一键运行,简单易懂,上手快,实用。
获取完整源码有两种方式。第一种:打开微信,搜一搜"别打我女儿的主意",打开该微信小程序,在菜单栏中点击"源码",进去就可以获得下载链接。
第二种:给本文点赞、好评,然后发邮件到[email protected]索取源码;如果你有学习资料或视频可以分享,也欢迎顺便分享给我。
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Web;
namespace ReptileDemo
{
public class HttpHelper
{
    private static readonly Logger logger = new Logger(typeof(HttpHelper));

    // Numeric value of the HTTP status that DownloadHtml reports by returning null.
    private const int UnusedStatusCode = 306;

    /// <summary>
    /// Downloads the document at <paramref name="url"/> with a single GET request that
    /// carries browser-like headers, and decodes the response body as UTF-8.
    /// See also http://tool.sufeinet.com/HttpHelper.aspx. HttpWebRequest is used here
    /// because it exposes more request options; WebClient is the simpler alternative.
    /// </summary>
    /// <param name="url">Absolute http/https URL to fetch.</param>
    /// <returns>
    /// The response body on HTTP 200. An empty string when the response status is not 200
    /// or the request fails with a <see cref="WebException"/>. <c>null</c> when the body
    /// cannot be read or the server answers with status 306.
    /// </returns>
    /// <exception cref="ArgumentException">
    /// <paramref name="url"/> does not produce an HTTP request (for example a file:// or ftp:// URL).
    /// </exception>
    public static string DownloadHtml(string url)
    {
        string html = string.Empty;
        try
        {
            logger.Info($"准备下载{url}");

            HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
            if (request == null)
            {
                // Previously this surfaced as a NullReferenceException on the next line.
                throw new ArgumentException("Only http/https URLs can be downloaded.", nameof(url));
            }

            request.Timeout = 30 * 1000; // 30 second timeout
            request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3641.400 QQBrowser/10.4.3284.400";
            request.ContentType = "text/html;charset=utf-8";
            // The Host header is deliberately not overridden: by default it is derived from
            // the request URL, so this helper also works for URLs outside www.jd.com.
            request.Headers.Add("Cookie", @"unpl=V2_ZzNtbRVeQRx0C08HfRsLA2JWRVsSUBRBcQ1DA3xJXQduAxtfclRCFX0UR1ZnGVkUZAIZXkZcQxxFCEdkeB5fA2AFEFlBZxBFLV0CFi9JH1c%2bbRVZSl5CEXILQlNLKV8FVwMTbUJRRBFzD0NQfR1sNWAzIm1HUUYWfQh2VUsYbEczXxVdSl5GFjgIQFN%2fH1sAYwUWbUNnQA%3d%3d; __jdc=122270672; __jdv=122270672|baidu-search|t_262767352_baidusearch|cpc|6598056256_0_f838028b73f74df7a6fe544f6a038183|1552566317616; areaId=2; ipLoc-djd=2-2826-0; __jdu=1765764575; PCSYCityID=2; shshshfp=19b93df9a57acfd8a2c8bc93453cacdb; shshshfpa=fe4d06e3-df43-07b4-ef34-2e150bbf4912-1552566333; shshshfpb=qcA4sOiYOPoP6A79RQBqduQ%3D%3D; user-key=a870a5c1-ff4b-4d89-839a-f5af593dbdde; cn=0; __jda=122270672.1765764575.1552566316.1552566318.1552569329.2; 3AB9D23F7A4B3C9B=QHZNLN23OX664GDGD5OQCQXB46HMP75FXD2PI3IQOZN6CYBCARXEALNA67GD42KRENN63AMA42URMUXLDYEJ63CEYM; __jdb=122270672.6.1765764575|2.1552569329; QRCodeKey=AAEAINsIf6OrwCryLWM0IMIGU61UjVwXDiuCP4X655piOkI5; wlfstk_smdl=x4nyu0p9ivxzkkgh7h0y0b6hin6lwmjt");
            request.Method = "GET";
            // NOTE(review): assigning a CookieContainer after adding a raw Cookie header may
            // cause the framework to send the (empty) container instead of the header above.
            // Kept exactly as before; confirm which of the two is actually wanted.
            request.CookieContainer = new CookieContainer();

            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                if (response.StatusCode != HttpStatusCode.OK)
                {
                    logger.Warn(string.Format("抓取{0}地址返回失败,response.StatusCode为{1}", url, response.StatusCode));
                    return html;
                }

                try
                {
                    // Both the stream and the reader are disposed even when ReadToEnd throws.
                    using (Stream body = response.GetResponseStream())
                    using (StreamReader reader = new StreamReader(body, Encoding.UTF8))
                    {
                        html = reader.ReadToEnd();
                    }
                }
                catch (Exception ex)
                {
                    // Interpolate only: passing the text through string.Format dropped the
                    // exception and threw FormatException for URLs containing '{' or '}'.
                    logger.Warn($"DownloadHtml抓取{url}失败:{ex}");
                    html = null;
                }
            }
        }
        catch (WebException ex)
        {
            // Decide on the status code rather than on the localized exception message.
            using (HttpWebResponse errorResponse = ex.Response as HttpWebResponse)
            {
                if (errorResponse != null && (int)errorResponse.StatusCode == UnusedStatusCode)
                {
                    html = null;
                }
            }

            logger.Warn($"DownloadHtml抓取{url}失败:{ex.Message}");
        }

        return html;
    }
}
}
using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using Newtonsoft.Json.Linq;
namespace ReptileDemo
{
public partial class Index : System.Web.UI.Page
{
    private Logger logger = new Logger(typeof(Index));

    /// <summary>
    /// Downloads the JD "all categories" page and builds a JSON array describing its
    /// three-level category tree. Each array element has the shape
    /// <c>{ oneCategory, TwoCategory: [ { url, name, ThreeCategory: [ { urls, names } ] } ] }</c>.
    /// </summary>
    /// <param name="sender">Event source supplied by the page life cycle; not used.</param>
    /// <param name="e">Event data supplied by the page life cycle; not used.</param>
    protected void Page_Load(object sender, EventArgs e)
    {
        string html = HttpHelper.DownloadHtml("https://www.jd.com/allSort.aspx");
        if (string.IsNullOrEmpty(html))
        {
            return;
        }

        // Parse with HtmlAgilityPack and navigate with XPath.
        HtmlDocument document = new HtmlDocument();
        document.LoadHtml(html);

        // Each matching element is one first-level category block.
        // SelectNodes returns null, not an empty collection, when nothing matches.
        HtmlNodeCollection categoryBlocks = document.DocumentNode.SelectNodes("//*[@class='category-item m']");
        if (categoryBlocks == null)
        {
            return;
        }

        JArray categories = new JArray();
        foreach (HtmlNode block in categoryBlocks)
        {
            JObject category = BuildFirstLevel(block);
            if (category != null)
            {
                categories.Add(category);
            }
        }

        // The serialized tree is not consumed anywhere in this handler yet; it is the
        // value to persist or render once a destination is decided.
        string categoriesJson = categories.ToString();
    }

    /// <summary>
    /// Builds the JSON object for one first-level category block, or returns null when
    /// the block contains no <c>h2/span</c> title (such blocks are left out of the result).
    /// </summary>
    private static JObject BuildFirstLevel(HtmlNode block)
    {
        // descendant-or-self:: searches the block itself and everything below it, which
        // is what "//h2/span" matched when the block was re-parsed as its own document.
        HtmlNodeCollection titles = block.SelectNodes("descendant-or-self::h2/span");
        if (titles == null)
        {
            return null;
        }

        JObject category = new JObject();
        // When a heading holds several spans the last one supplies the name, as before.
        category["oneCategory"] = titles[titles.Count - 1].InnerHtml;

        JArray secondLevel = BuildSecondLevel(block);
        if (secondLevel != null)
        {
            category["TwoCategory"] = secondLevel;
        }

        return category;
    }

    /// <summary>
    /// Builds the second-level entries (the <c>dl/dt/a</c> links) found in a category
    /// block, or returns null when the block has none.
    /// </summary>
    private static JArray BuildSecondLevel(HtmlNode block)
    {
        HtmlNodeCollection links = block.SelectNodes("descendant-or-self::dl/dt/a");
        if (links == null)
        {
            return null;
        }

        JArray entries = new JArray();
        foreach (HtmlNode link in links)
        {
            JObject entry = new JObject();
            // GetAttributeValue yields "" for an anchor without href instead of throwing
            // NullReferenceException like Attributes["href"].Value did.
            entry["url"] = link.GetAttributeValue("href", string.Empty);
            entry["name"] = link.InnerHtml;

            // link is dl/dt/a, so its grandparent is the <dl> that also holds the
            // third-level <dd> links belonging to this entry.
            JArray thirdLevel = BuildThirdLevel(link.ParentNode.ParentNode);
            if (thirdLevel != null)
            {
                entry["ThreeCategory"] = thirdLevel;
            }

            entries.Add(entry);
        }

        return entries;
    }

    /// <summary>
    /// Builds the third-level entries (the <c>dl/dd/a</c> links) of one <c>dl</c> group,
    /// or returns null when the group has none.
    /// </summary>
    private static JArray BuildThirdLevel(HtmlNode group)
    {
        HtmlNodeCollection links = group.SelectNodes("descendant-or-self::dl/dd/a");
        if (links == null)
        {
            return null;
        }

        JArray entries = new JArray();
        foreach (HtmlNode link in links)
        {
            JObject entry = new JObject();
            entry["urls"] = link.GetAttributeValue("href", string.Empty);
            entry["names"] = link.InnerHtml;
            entries.Add(entry);
        }

        return entries;
    }
}
}
附加:用WebClient方法下载网页,并把页面中的图片保存到本地
// Downloads one page, collects the image URLs it references, and saves every image
// into a "photos" folder under the application's base directory as 1.jpg, 2.png, ...
Uri pageUri = new Uri("http://www.4399.com");

// Path.Combine avoids the doubled separator that string concatenation produces when
// the base directory already ends with one.
string photosDirectory = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "photos");
Directory.CreateDirectory(photosDirectory);

// One client serves both the page and the images, and is disposed when done.
using (WebClient client = new WebClient())
{
    byte[] pageData = client.DownloadData(pageUri);
    string pageHtml = Encoding.Default.GetString(pageData);

    HtmlDocument document = new HtmlDocument();
    document.LoadHtml(pageHtml);

    // The image address is the src attribute of each <img>. The earlier version stored
    // the InnerHtml of every <a>, which is markup and can never be downloaded.
    List<Uri> imageUrls = new List<Uri>();
    HtmlNodeCollection imageNodes = document.DocumentNode.SelectNodes("//img");
    if (imageNodes != null)
    {
        foreach (HtmlNode imageNode in imageNodes)
        {
            string source = HtmlEntity.DeEntitize(imageNode.GetAttributeValue("src", string.Empty));
            Uri resolved;
            // Resolve relative and protocol-relative addresses against the page URL and
            // skip anything that is not plain http/https (for example data: URIs).
            if (!string.IsNullOrWhiteSpace(source)
                && Uri.TryCreate(pageUri, source, out resolved)
                && (resolved.Scheme == Uri.UriSchemeHttp || resolved.Scheme == Uri.UriSchemeHttps))
            {
                imageUrls.Add(resolved);
            }
        }
    }

    int fileNumber = 1; // used to name the saved files; advances only after a success
    foreach (Uri imageUrl in imageUrls)
    {
        // Literal, case-insensitive search. The former Regex(".jpg") left the dot
        // unescaped, so it also matched names such as "xjpg".
        // Anything that is neither .jpg nor .png is saved as .gif, as before.
        string address = imageUrl.AbsoluteUri;
        string extension;
        if (address.IndexOf(".jpg", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            extension = ".jpg";
        }
        else if (address.IndexOf(".png", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            extension = ".png";
        }
        else
        {
            extension = ".gif";
        }

        string filePath = Path.Combine(photosDirectory, fileNumber + extension);
        try
        {
            client.DownloadFile(imageUrl, filePath);
            Console.WriteLine("下载成功");
            fileNumber++;
        }
        catch (Exception ex)
        {
            // Best effort: one failed image must not stop the rest, but say why it failed.
            Console.WriteLine("下载失败");
            Console.WriteLine(ex.Message);
        }
    }
}