最近为了提升技术,自己写了一个简单的电影链接网站,主要分为三部分:
第一个:网站
点击打开链接
第二个:后台管理
点击打开链接
账号:ag 密码:test@123
第三个:抓取服务
本文重点介绍抓取服务。目前只抓取了两个电影网站的部分信息(仅供技术学习与开发使用)。
现在直接上代码:
Program
using Autofac;
using Autofac.Builder;
using OA.Common.DtoModel;
using Ohye.Film.Application;
using Ohye.Film.Domain;
using Ohye.Film.Infrastructure;
using Ohye.Film.Infrastructure.EFRepositories;
using Ohye.Film.Infrastructure.EFRepositories.UnitOfWork;
using Ohye.Film.Service.Spider;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Reflection;
using System.Text;
using System.Threading.Tasks;
namespace Ohye.Film.Service
{
class Program
{
/// <summary>
/// Entry point. Wires up dependencies, runs one crawl immediately, then wakes every ten
/// minutes: during the 04:00 hour it re-crawls (once per day) and during the 06:00 hour it
/// requests the site's home page (once per day).
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    Init();
    SpireFilms();

    // Dates only move forward, so remembering the last day each job ran is enough.
    string lastSpireDate = null;
    string lastIndexDate = null;

    // One client for the lifetime of the process instead of a new, undisposed one per day.
    var http = new System.Net.Http.HttpClient();

    while (true)
    {
        // Take a single snapshot so the date and the hour always belong to the same instant.
        var now = DateTime.Now;
        var date = now.ToString("yyyyMMdd");

        if (now.Hour == 4 && lastSpireDate != date)
        {
            SpireFilms();
            lastSpireDate = date;
        }

        if (now.Hour == 6 && lastIndexDate != date)
        {
            try
            {
                // Wait for the response so success is only reported when the request worked.
                using (var response = http.GetAsync("http://film.ohyewang.com/").Result)
                {
                    response.EnsureSuccessStatusCode();
                }

                // Marked done only on success, so a failed request is retried on the next wake-up.
                lastIndexDate = date;
                Console.ForegroundColor = ConsoleColor.Red;
                Console.WriteLine($"生成首页成功:{DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss")}");
            }
            catch (Exception ex)
            {
                Console.ForegroundColor = ConsoleColor.Red;
                Console.WriteLine($"生成首页失败:{ex.GetBaseException().Message}");
            }

            Console.ForegroundColor = ConsoleColor.Gray;
        }

        System.Threading.Thread.Sleep(TimeSpan.FromMinutes(10));
    }
}
/// <summary>
/// Starts one spider per seed list page. Each tuple is (list page URL, release year, country);
/// the year and country are stored on every film found from that page.
/// </summary>
private static void SpireFilms()
{
    var pageList = new List<Tuple<string, int, string>>
    {
        Tuple.Create("http://list.iqiyi.com/www/1/2-----------2017--11-1-1-iqiyi--.html", 2017, "美国"),
        Tuple.Create("http://list.iqiyi.com/www/1/2-----------2016--11-1-1-iqiyi--.html", 2016, "美国"),
        Tuple.Create("http://list.iqiyi.com/www/1/2-----------2015--11-1-1-iqiyi--.html", 2015, "美国"),
        Tuple.Create("http://list.iqiyi.com/www/1/1-----------2017--11-1-1-iqiyi--.html", 2017, "华语"),
        Tuple.Create("http://list.iqiyi.com/www/1/1-----------2016--11-1-1-iqiyi--.html", 2016, "华语"),
        Tuple.Create("http://list.iqiyi.com/www/1/1-----------2015--11-1-1-iqiyi--.html", 2015, "华语"),
    };
    foreach (var page in pageList)
    {
        ISplider iqiyiSpider = new AIQIYI { Url = page.Item1, Year = page.Item2, Country = page.Item3 };
        iqiyiSpider.SpliderResult();
    }

    // The MGTV seeds carry no release year, so 0 is stored.
    var pageListMGTV = new List<Tuple<string, int, string>>
    {
        Tuple.Create("https://list.mgtv.com/3/a4-537193-------2835073-2-1--a1-.html?channelId=3", 0, "美国"),
        Tuple.Create("https://list.mgtv.com/3/a4-49-------2835073-2-1--a1-.html?channelId=3", 0, "华语"),
    };
    foreach (var page in pageListMGTV)
    {
        ISplider mgtvSpider = new mgtv { Url = page.Item1, Year = page.Item2, Country = page.Item3 };
        mgtvSpider.SpliderResult();
    }

    // The qq spider is intentionally not started. Its seed would be
    // ("http://film.qq.com/film_all_list/allfilm.html?type=movie&sort=5", 0, "美国").
}
/// <summary>
/// One-time start-up: registers the AutoMapper mappings, then adds the service's
/// registrations to the shared container builder.
/// </summary>
static void Init()
{
    AutoMapperConfig.RegisterMappings();
    SetupResolveRules(IocCenter.ContainerBuilder);
}
/// <summary>
/// Adds this service's component registrations to the given Autofac builder.
/// </summary>
/// <param name="builder">Builder that receives the registrations.</param>
// NOTE(review): the RegisterType() and As() calls below have no type argument in this
// listing. The generic arguments appear to have been lost when the post was published —
// restore them from the original project before compiling.
static void SetupResolveRules(ContainerBuilder builder)
{
var application = Assembly.Load("Ohye.Film.Application");
// OAUser is produced by a factory delegate that calls CreateOAUser.
builder.Register(c => CreateOAUser()).AsSelf();
builder.RegisterType().AsSelf().SingleInstance();
// Every type in the application assembly whose name ends with "Service" is registered as itself.
builder.RegisterAssemblyTypes(application)
.Where(t => t.Name.EndsWith("Service"))
.AsSelf().InstancePerDependency();
builder.RegisterType().InstancePerLifetimeScope();
builder.RegisterType().As().InstancePerDependency();
// Open-generic mapping: IRepository<T> resolves to Repository<T>.
builder.RegisterGeneric(typeof(Repository<>)).As(typeof(IRepository<>)).InstancePerDependency();
}
/// <summary>
/// Builds the user identity the service runs under: empty employee and department ids
/// with fixed display names.
/// </summary>
/// <returns>A new <c>OAUser</c> on every call.</returns>
static OAUser CreateOAUser() => new OAUser
{
    EmplID = "",
    EmplName = "系统管理员",
    DeptID = "",
    DeptName = "总部",
};
}
}
Config
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace Ohye.Film.Service
{
/// <summary>
/// Directory settings for the crawler service.
/// </summary>
public class Config
{
    /// <summary>
    /// Value of the "DataDir" application setting, or null when the setting is absent.
    /// </summary>
    public static string DataDir
    {
        get
        {
            return System.Configuration.ConfigurationManager.AppSettings["DataDir"];
        }
    }

    /// <summary>
    /// The "FilmTemp" directory under <see cref="DataDir"/>; created on first access.
    /// </summary>
    public static string TempDir
    {
        get
        {
            // Path.Combine inserts the separator only when DataDir lacks one, so the result no
            // longer depends on the setting ending with a slash. A missing setting still yields
            // the relative path "FilmTemp", as before.
            string temp = System.IO.Path.Combine(DataDir ?? string.Empty, "FilmTemp");

            // CreateDirectory does nothing when the directory already exists.
            System.IO.Directory.CreateDirectory(temp);
            return temp;
        }
    }
}
}
SpireClient
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Net;
using System.Net.Http;
using HtmlAgilityPack;
namespace Ohye.Film.Service.Spider
{
/// <summary>
/// Small HTTP + HTML helper used by the spiders: fetches pages, raises <see cref="Complete"/>
/// with the page text, and runs XPath queries over HTML fragments.
/// </summary>
public class SpireClient
{
    // HttpClient is meant to be shared; one instance serves every SpireClient.
    static readonly HttpClient _httpClient = new HttpClient();

    // URLs already requested through this instance. This used to be a static list that every
    // constructor replaced, so it never de-duplicated across instances and was mutated from
    // several threads without synchronisation. Per-instance state keeps the effective
    // behaviour and removes the race.
    // NOTE(review): if de-duplication across pages is wanted, it needs a shared set that is
    // cleared at the start of each scheduled crawl; otherwise the daily re-crawl would be skipped.
    readonly HashSet<string> _spiredUrlList = new HashSet<string>();

    /// <summary>
    /// Raised with the response body after <see cref="SpireUrl"/> has downloaded a page.
    /// The sender is this client.
    /// </summary>
    public event EventHandler<string> Complete;

    public SpireClient()
    {
    }

    /// <summary>Downloads <paramref name="url"/> and returns the body as text.</summary>
    /// <param name="url">Address to fetch.</param>
    /// <returns>A task that yields the response body.</returns>
    public async Task<string> GetHtml(string url)
    {
        return await _httpClient.GetStringAsync(url);
    }

    /// <summary>
    /// Starts downloading <paramref name="url"/> and returns immediately; <see cref="Complete"/>
    /// is raised when the body has been read. A URL already requested through this instance is ignored.
    /// </summary>
    /// <param name="url">Address to fetch.</param>
    public void SpireUrl(string url)
    {
        lock (_spiredUrlList)
        {
            // Add returns false when the URL is already present.
            if (!_spiredUrlList.Add(url))
            {
                return;
            }
        }

        // Fire-and-forget by design: the method handles and reports its own failures.
        FetchAndNotify(url);
    }

    // Downloads the page, then hands the text to the Complete subscribers.
    private async Task FetchAndNotify(string url)
    {
        try
        {
            string html;
            using (HttpResponseMessage response = await _httpClient.GetAsync(url).ConfigureAwait(false))
            {
                // The body is read whatever the status code is, as before.
                html = await response.Content.ReadAsStringAsync().ConfigureAwait(false);
            }

            OnGetResult(this, html);
        }
        catch (Exception ex)
        {
            // Previously a failed request or a throwing subscriber vanished as an unobserved task exception.
            Console.ForegroundColor = ConsoleColor.Red;
            Console.WriteLine(url + ex.Message + ex.InnerException);
            Console.WriteLine("failed");
            Console.ForegroundColor = ConsoleColor.Gray;
        }
    }

    private void OnGetResult(object sender, string e)
    {
        Complete?.Invoke(sender, e);
    }

    /// <summary>
    /// Parses <paramref name="content"/> as HTML and returns the nodes matching an XPath expression.
    /// </summary>
    /// <param name="content">HTML text to parse.</param>
    /// <param name="regex">An XPath expression (despite the parameter name).</param>
    /// <returns>The matching nodes; an empty list when nothing matches.</returns>
    public List<HtmlNode> SelectNodes(string content, string regex)
    {
        HtmlDocument htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(content);
        var htmlNodes = htmlDoc.DocumentNode.SelectNodes(regex);
        return htmlNodes == null ? new List<HtmlNode>() : htmlNodes.ToList();
    }
}
}
HttpImage
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Net.Http;
using System.IO;
using OA.Infrastructure;
namespace Ohye.Film.Service.Spider
{
/// <summary>
/// Downloads images and stores them through <c>MongoContext.Mongo</c>.
/// </summary>
public class HttpImage
{
    // Shared client: the previous code created a new, never-disposed HttpClient per download.
    static readonly HttpClient _httpClient = new HttpClient();

    /// <summary>
    /// Downloads one image to the temp directory, saves it with <c>MongoContext.Mongo.SaveFile</c>
    /// and removes the temp file.
    /// </summary>
    /// <param name="url">Absolute image URL.</param>
    /// <returns>The id returned by <c>SaveFile</c>.</returns>
    public string GetImg(string url)
    {
        // Blocks until the download finishes; kept synchronous because callers are synchronous.
        byte[][] images = DownloadPicAsync(new string[] { url }).Result;

        // File name = text after the last '/', without any query string or fragment,
        // so "?..." cannot end up in the file name.
        string imageName = url.Substring(url.LastIndexOf('/') + 1);
        int suffixStart = imageName.IndexOfAny(new[] { '?', '#' });
        if (suffixStart >= 0)
        {
            imageName = imageName.Substring(0, suffixStart);
        }

        string filePath = Path.Combine(Config.TempDir, imageName);
        try
        {
            // WriteAllBytes replaces existing content. FileMode.OpenOrCreate left the tail of a
            // longer, older file with the same name in place.
            File.WriteAllBytes(filePath, images[0]);
            Console.WriteLine("成功下载图片:" + imageName);
            return MongoContext.Mongo.SaveFile(filePath);
        }
        finally
        {
            // Remove the temp file even when saving fails.
            File.Delete(filePath);
        }
    }

    /// <summary>
    /// Downloads several images concurrently.
    /// </summary>
    /// <param name="urls">Image URLs to download.</param>
    /// <returns>One byte array per URL, in the same order as <paramref name="urls"/>.</returns>
    public async Task<byte[][]> DownloadPicAsync(IEnumerable<string> urls)
    {
        Task<byte[]>[] downloadTask = urls.Select(r => _httpClient.GetByteArrayAsync(r)).ToArray();
        return await Task.WhenAll(downloadTask);
    }
}
}
AIQIYI
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Ohye.Film.DTO.Film;
using Ohye.Film.Application.Film;
using Autofac;
using Ohye.Film.Infrastructure.Enums;
using Ohye.Film.Infrastructure;
namespace Ohye.Film.Service.Spider
{
/// <summary>
/// Spider for iQIYI list pages. For every film on a page it creates the film if its name is
/// unknown, or attaches a play link if the film exists but was not playable yet, then follows
/// the pager to the next page.
/// </summary>
public class AIQIYI : ISplider
{
    // Category assigned to every film this spider creates.
    private static readonly Guid FilmCategoryID = Guid.Parse("d012fcc6-b25a-447c-b079-95cc293a3f92");

    public AIQIYI()
    {
    }

    /// <summary>List page to crawl; overwritten with the next page's address while paging.</summary>
    public string Url { get; set; }

    /// <summary>Release year stored on every film created from the list page.</summary>
    public int Year { get; set; }

    /// <summary>Country stored on every film created from the list page.</summary>
    public string Country { get; set; }

    /// <summary>Starts fetching <see cref="Url"/>; the page is processed when the download completes.</summary>
    public void SpliderResult()
    {
        SpireClient spireClient = new SpireClient();
        spireClient.Complete += SpireClient_Complete;
        spireClient.SpireUrl(Url);
    }

    // Handles one downloaded list page: saves each film, then moves on to the next page.
    private void SpireClient_Complete(object sender, string html)
    {
        SpireClient client = (SpireClient)sender;
        foreach (var item in client.SelectNodes(html, "//ul[contains(@class,'site-piclist')]/li"))
        {
            SaveFilm(client, item.InnerHtml);
        }
        CrawlNextPage(client, html);
    }

    // Parses one list item and creates or updates the film. Parsing happens inside the try so a
    // single malformed item is logged and skipped instead of aborting the page and the paging.
    private void SaveFilm(SpireClient client, string itemHtml)
    {
        string productName = null;
        try
        {
            var linkPic = client.SelectNodes(itemHtml, "//div[@class='site-piclist_pic']/a").SingleOrDefault();
            // The film counts as playable when this marker element is absent.
            bool canRead = !client.SelectNodes(linkPic.InnerHtml, "//p[@class='viedo_lt ']").Any();
            productName = linkPic.Attributes.Where(x => x.Name == "title").SingleOrDefault().Value.Trim();
            var detailURL = linkPic.Attributes.Where(x => x.Name == "href").SingleOrDefault().Value.Trim();
            var imgUrl = client.SelectNodes(linkPic.InnerHtml, "//img").SingleOrDefault().Attributes.Where(x => x.Name == "src").SingleOrDefault().Value.Trim();
            var duration = client.SelectNodes(linkPic.InnerHtml, "//span[@class='icon-vInfo']").SingleOrDefault().InnerText.Trim();
            var linkInfo = client.SelectNodes(itemHtml, "//div[@class='site-piclist_info']").SingleOrDefault();
            var score = client.SelectNodes(linkInfo.InnerHtml, "//span[@class='score']").SingleOrDefault().InnerText.Trim();
            var authors = client.SelectNodes(linkInfo.InnerHtml, "//div[@class='role_info']/em/a").SelectMany(x => x.Attributes).Where(x => x.Name == "title").Select(x => x.Value).ToList();

            HttpImage httpImage = new HttpImage();
            // NOTE(review): Resolve has no type argument in the published listing; the generic
            // argument (the product service type) appears to have been lost — restore it.
            IocCenter.Resolve(_productService =>
            {
                if (!_productService.CheckExisted(productName))
                {
                    // The detail page is only needed for a new film, so it is fetched here
                    // rather than for every item on every crawl.
                    var detailHtml = client.GetHtml(detailURL).Result;
                    var introduction = client.SelectNodes(detailHtml, "//span[@id='data-videoInfoDes']").SingleOrDefault()?.InnerText.Trim();

                    // On failure TryParse leaves dur at TimeSpan.Zero.
                    TimeSpan dur;
                    TimeSpan.TryParse(duration, out dur);
                    FM_ProductDTO product = new FM_ProductDTO
                    {
                        ID = Guid.NewGuid(),
                        Name = productName,
                        CategoryID = FilmCategoryID,
                        Year = Year,
                        // Empty score becomes 0, as the mgtv spider does; invariant culture so
                        // "8.5" parses the same on every machine.
                        Score = decimal.Parse(score == "" ? "0" : score, System.Globalization.CultureInfo.InvariantCulture),
                        Duration = dur,
                        CanRead = canRead,
                        ImageID = null,
                        IsDeleted = false,
                        Country = Country,
                        Content = new FM_ContentDTO
                        {
                            ID = Guid.NewGuid(),
                            Introduction = introduction,
                            ReadCount = 0,
                            DownLoadCount = 0
                        },
                        LinkList = canRead ? CreatePlayLinks(detailURL) : new List<FM_LinkDTO>(),
                        AuthorList = authors.Select(x => new FM_AuthorDTO
                        {
                            ID = Guid.NewGuid(),
                            AuhorType = AuhorType.Main,
                            Name = x
                        }).ToList()
                    };
                    product.ImageID = httpImage.GetImg(imgUrl);
                    _productService.Add(product);
                    Console.ForegroundColor = ConsoleColor.DarkGreen;
                    Console.WriteLine(productName);
                    Console.ForegroundColor = ConsoleColor.Gray;
                }
                else if (canRead)
                {
                    var productInfo = _productService.CheckCanRead(productName);
                    if (!productInfo.Item1)
                    {
                        Console.ForegroundColor = ConsoleColor.Green;
                        Console.WriteLine($"发现新可播放电影:{productName}");
                        Console.ForegroundColor = ConsoleColor.Gray;
                        // Known film that has just become playable: attach the play link.
                        _productService.UpdateLink(productInfo.Item2, CreatePlayLinks(detailURL));
                    }
                }
                else
                {
                    Console.WriteLine($"已存在:{productName}");
                }
            });
        }
        catch (Exception ex)
        {
            Console.ForegroundColor = ConsoleColor.Red;
            Console.WriteLine(productName + ex.Message + ex.InnerException);
            Console.WriteLine("failed");
            Console.ForegroundColor = ConsoleColor.Gray;
        }
    }

    // A single, already-approved play link pointing at the given address.
    private static List<FM_LinkDTO> CreatePlayLinks(string address)
    {
        return new List<FM_LinkDTO>
        {
            new FM_LinkDTO
            {
                ID = Guid.NewGuid(),
                Address = address,
                AuditStatus = AuditStatus.AuditPass,
                AuditTime = DateTime.Now,
                LinkType = LinkType.PlayUrl
            }
        };
    }

    // Finds the pager link whose data-key is the current page number + 1 and crawls it.
    private void CrawlNextPage(SpireClient client, string html)
    {
        var currentPage = client.SelectNodes(html, "//div[@class='mod-page']/span[@class='curPage']").SingleOrDefault();
        int pageIndex;
        if (currentPage == null || !int.TryParse(currentPage.InnerText, out pageIndex))
        {
            return;
        }

        foreach (var link in client.SelectNodes(html, "//div[@class='mod-page']/a[@data-search-page='item']"))
        {
            var key = link.Attributes["data-key"];
            var href = link.Attributes["href"];
            int pageNumber;
            // Non-numeric keys ("up"/"down") and links without href are skipped.
            if (key == null || href == null || !int.TryParse(key.Value, out pageNumber))
            {
                continue;
            }
            if (pageNumber == pageIndex + 1)
            {
                Url = $"http://list.iqiyi.com/{href.Value}";
                SpliderResult();
                // Stop at the first match so a duplicated pager link cannot start a second crawl.
                return;
            }
        }
    }
}
}
mgtv
using Ohye.Film.Application.Film;
using Ohye.Film.DTO.Film;
using Ohye.Film.Infrastructure;
using Ohye.Film.Infrastructure.Enums;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace Ohye.Film.Service.Spider
{
/// <summary>
/// Spider for MGTV list pages. For every film on a page it creates the film if its name is
/// unknown, or attaches a play link if the film exists but was not playable yet, then follows
/// the pager to the next page.
/// </summary>
public class mgtv : ISplider
{
    // Category assigned to every film this spider creates.
    private static readonly Guid FilmCategoryID = Guid.Parse("d012fcc6-b25a-447c-b079-95cc293a3f92");

    public mgtv()
    {
    }

    /// <summary>List page to crawl; overwritten with the next page's address while paging.</summary>
    public string Url { get; set; }

    /// <summary>Release year stored on every film created from the list page.</summary>
    public int Year { get; set; }

    /// <summary>Country stored on every film created from the list page.</summary>
    public string Country { get; set; }

    /// <summary>Starts fetching <see cref="Url"/>; the page is processed when the download completes.</summary>
    public void SpliderResult()
    {
        SpireClient spireClient = new SpireClient();
        spireClient.Complete += SpireClient_Complete;
        spireClient.SpireUrl(Url);
    }

    // Handles one downloaded list page: saves each film, then moves on to the next page.
    private void SpireClient_Complete(object sender, string html)
    {
        SpireClient client = (SpireClient)sender;
        foreach (var item in client.SelectNodes(html, "//ul/li[contains(@class,'m-result-list-item')]"))
        {
            SaveFilm(client, item.InnerHtml);
        }
        CrawlNextPage(client, html);
    }

    // Parses one list item and creates or updates the film. Parsing happens inside the try so a
    // single malformed item is logged and skipped instead of aborting the page and the paging.
    private void SaveFilm(SpireClient client, string itemHtml)
    {
        string productName = null;
        try
        {
            var linkPic = client.SelectNodes(itemHtml, "//a[contains(@class,'u-video u-video-y')]").SingleOrDefault();
            // The film counts as playable when this marker element is absent.
            bool canRead = !client.SelectNodes(linkPic.InnerHtml, "//i[@class='mark-v']").Any();
            productName = client.SelectNodes(itemHtml, "//a[@class='u-title']").FirstOrDefault().InnerHtml.Trim();
            var detailURL = ToHttpsUrl(linkPic.Attributes.Where(x => x.Name == "href").SingleOrDefault().Value.Trim());
            var imgUrl = ToHttpsUrl(client.SelectNodes(linkPic.InnerHtml, "//img[@class='u-pic']").SingleOrDefault().Attributes.Where(x => x.Name == "src").SingleOrDefault().Value.Trim());
            var score = client.SelectNodes(linkPic.InnerHtml, "//em[@class='u-meta']").SingleOrDefault().InnerText.Trim();
            var authors = client.SelectNodes(itemHtml, "//span[@class='u-desc']/a").SelectMany(x => x.Attributes).Where(x => x.Name == "title").Select(x => x.Value).ToList();

            HttpImage httpImage = new HttpImage();
            // NOTE(review): Resolve has no type argument in the published listing; the generic
            // argument (the product service type) appears to have been lost — restore it.
            IocCenter.Resolve(_productService =>
            {
                if (!_productService.CheckExisted(productName))
                {
                    // The detail page is only needed for a new film, so it is fetched here
                    // rather than for every item on every crawl.
                    var detailHtml = client.GetHtml(detailURL).Result;
                    var introduction = client.SelectNodes(detailHtml, "//p[@class='u-meta-intro']/span[@class='details']").FirstOrDefault()?.InnerText.Trim();

                    FM_ProductDTO product = new FM_ProductDTO
                    {
                        ID = Guid.NewGuid(),
                        Name = productName,
                        CategoryID = FilmCategoryID,
                        Year = Year,
                        // Empty score becomes 0; invariant culture so "8.5" parses the same on every machine.
                        Score = decimal.Parse(score == "" ? "0" : score, System.Globalization.CultureInfo.InvariantCulture),
                        // No duration is read from the list page (the old code parsed an empty string, which yields zero).
                        Duration = TimeSpan.Zero,
                        CanRead = canRead,
                        ImageID = null,
                        IsDeleted = false,
                        Country = Country,
                        Content = new FM_ContentDTO
                        {
                            ID = Guid.NewGuid(),
                            Introduction = introduction,
                            ReadCount = 0,
                            DownLoadCount = 0
                        },
                        LinkList = canRead ? CreatePlayLinks(detailURL) : new List<FM_LinkDTO>(),
                        AuthorList = authors.Select(x => new FM_AuthorDTO
                        {
                            ID = Guid.NewGuid(),
                            AuhorType = AuhorType.Main,
                            Name = x
                        }).ToList()
                    };
                    product.ImageID = httpImage.GetImg(imgUrl);
                    _productService.Add(product);
                    Console.ForegroundColor = ConsoleColor.DarkGreen;
                    Console.WriteLine(productName);
                    Console.ForegroundColor = ConsoleColor.Gray;
                }
                else if (canRead)
                {
                    var productInfo = _productService.CheckCanRead(productName);
                    if (!productInfo.Item1)
                    {
                        Console.ForegroundColor = ConsoleColor.Green;
                        Console.WriteLine($"发现新可播放电影:{productName}");
                        Console.ForegroundColor = ConsoleColor.Gray;
                        // Known film that has just become playable: attach the play link.
                        _productService.UpdateLink(productInfo.Item2, CreatePlayLinks(detailURL));
                    }
                }
                else
                {
                    Console.WriteLine($"已存在:{productName}");
                }
            });
        }
        catch (Exception ex)
        {
            Console.ForegroundColor = ConsoleColor.Red;
            Console.WriteLine(productName + ex.Message + ex.InnerException);
            Console.WriteLine("failed");
            Console.ForegroundColor = ConsoleColor.Gray;
        }
    }

    // Turns a protocol-relative URL ("//host/path") into an https URL. Anything else is returned
    // unchanged; the old unconditional Substring(2) would have corrupted an already-absolute URL.
    private static string ToHttpsUrl(string url)
    {
        return url.StartsWith("//", StringComparison.Ordinal) ? "https:" + url : url;
    }

    // A single, already-approved play link pointing at the given address.
    private static List<FM_LinkDTO> CreatePlayLinks(string address)
    {
        return new List<FM_LinkDTO>
        {
            new FM_LinkDTO
            {
                ID = Guid.NewGuid(),
                Address = address,
                AuditStatus = AuditStatus.AuditPass,
                AuditTime = DateTime.Now,
                LinkType = LinkType.PlayUrl
            }
        };
    }

    // Finds the pager link whose text is the current page number + 1 and crawls it.
    private void CrawlNextPage(SpireClient client, string html)
    {
        var pages = client.SelectNodes(html, "//div[contains(@class,'w-pages w-pages-default')]/ul/li/a");
        var currentPage = pages.Where(p => p.Attributes["class"] != null && p.Attributes["class"].Value == "current").SingleOrDefault();
        int pageIndex;
        if (currentPage == null || !int.TryParse(currentPage.InnerText, out pageIndex))
        {
            return;
        }

        foreach (var link in pages)
        {
            var href = link.Attributes["href"];
            int pageNumber;
            // TryParse skips every non-numeric pager entry ("...", blanks, previous/next labels)
            // instead of throwing on the first one.
            if (href == null || !int.TryParse(link.InnerText, out pageNumber))
            {
                continue;
            }
            if (pageNumber == pageIndex + 1)
            {
                Url = $"https://list.mgtv.com/{href.Value}";
                SpliderResult();
                // Stop at the first match so a duplicated pager link cannot start a second crawl.
                return;
            }
        }
    }
}
}
using Ohye.Film.Application.Film;
using Ohye.Film.DTO.Film;
using Ohye.Film.Infrastructure;
using Ohye.Film.Infrastructure.Enums;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace Ohye.Film.Service.Spider
{
/// <summary>
/// Spider for Tencent Video (film.qq.com) list pages. It follows the same create-or-update
/// flow as the other spiders.
/// NOTE(review): the detail-page XPath and the whole paging step are identical to the iQIYI
/// spider's, including the list.iqiyi.com address — they look copied and not yet adapted to
/// this site. Confirm before enabling this spider.
/// </summary>
public class qq : ISplider
{
    // Category assigned to every film this spider creates.
    private static readonly Guid FilmCategoryID = Guid.Parse("d012fcc6-b25a-447c-b079-95cc293a3f92");

    public qq()
    {
    }

    /// <summary>List page to crawl; overwritten with the next page's address while paging.</summary>
    public string Url { get; set; }

    /// <summary>Release year stored on every film created from the list page.</summary>
    public int Year { get; set; }

    /// <summary>Country stored on every film created from the list page.</summary>
    public string Country { get; set; }

    /// <summary>Starts fetching <see cref="Url"/>; the page is processed when the download completes.</summary>
    public void SpliderResult()
    {
        SpireClient spireClient = new SpireClient();
        spireClient.Complete += SpireClient_Complete;
        spireClient.SpireUrl(Url);
    }

    // Handles one downloaded list page: saves each film, then moves on to the next page.
    private void SpireClient_Complete(object sender, string html)
    {
        SpireClient client = (SpireClient)sender;
        foreach (var item in client.SelectNodes(html, "//ul[@class='figures_list']/li"))
        {
            SaveFilm(client, item.InnerHtml);
        }
        CrawlNextPage(client, html);
    }

    // Parses one list item and creates or updates the film. Parsing happens inside the try so a
    // single malformed item is logged and skipped instead of aborting the page and the paging.
    private void SaveFilm(SpireClient client, string itemHtml)
    {
        string productName = null;
        try
        {
            var linkPic = client.SelectNodes(itemHtml, "//a").SingleOrDefault();
            // The film counts as playable when this marker element is absent.
            bool canRead = !client.SelectNodes(linkPic.InnerHtml, "//i[@class='mark_v ']").Any();
            productName = linkPic.Attributes.Where(x => x.Name == "title").SingleOrDefault().Value.Trim();
            var detailURL = linkPic.Attributes.Where(x => x.Name == "href").SingleOrDefault().Value.Trim();
            var imgUrl = client.SelectNodes(linkPic.InnerHtml, "//img").SingleOrDefault().Attributes.Where(x => x.Name == "src").SingleOrDefault().Value.Trim();
            var linkInfo = client.SelectNodes(itemHtml, "//div[@class='figure_title_score']").SingleOrDefault();
            // The score is assembled from two elements: integer part + "." + fractional part.
            // NOTE(review): the class names 'score_l' and 'score_2' are asymmetric — verify against the page markup.
            var score = client.SelectNodes(linkInfo.InnerHtml, "//div[@class='figure_score']/em[@class='score_l']").SingleOrDefault().InnerText.Trim() + "." + client.SelectNodes(itemHtml, "//div[@class='figure_score']/em[@class='score_2']").SingleOrDefault().InnerText.Trim();

            HttpImage httpImage = new HttpImage();
            // NOTE(review): Resolve has no type argument in the published listing; the generic
            // argument (the product service type) appears to have been lost — restore it.
            IocCenter.Resolve(_productService =>
            {
                if (!_productService.CheckExisted(productName))
                {
                    // The detail page is only needed for a new film, so it is fetched here
                    // rather than for every item on every crawl.
                    var detailHtml = client.GetHtml(detailURL).Result;
                    var introduction = client.SelectNodes(detailHtml, "//span[@id='data-videoInfoDes']").SingleOrDefault()?.InnerText.Trim();

                    FM_ProductDTO product = new FM_ProductDTO
                    {
                        ID = Guid.NewGuid(),
                        Name = productName,
                        CategoryID = FilmCategoryID,
                        Year = Year,
                        // Invariant culture so "8.5" parses the same on every machine.
                        Score = decimal.Parse(score, System.Globalization.CultureInfo.InvariantCulture),
                        // No duration is read from the list page (the old code parsed an empty string, which yields zero).
                        Duration = TimeSpan.Zero,
                        CanRead = canRead,
                        ImageID = null,
                        IsDeleted = false,
                        Country = Country,
                        Content = new FM_ContentDTO
                        {
                            ID = Guid.NewGuid(),
                            Introduction = introduction,
                            ReadCount = 0,
                            DownLoadCount = 0
                        },
                        LinkList = canRead ? CreatePlayLinks(detailURL) : new List<FM_LinkDTO>(),
                        // Authors are not scraped for this site yet.
                        AuthorList = new List<FM_AuthorDTO>()
                    };
                    product.ImageID = httpImage.GetImg(imgUrl);
                    _productService.Add(product);
                    Console.ForegroundColor = ConsoleColor.DarkGreen;
                    Console.WriteLine(productName);
                    Console.ForegroundColor = ConsoleColor.Gray;
                }
                else if (canRead)
                {
                    var productInfo = _productService.CheckCanRead(productName);
                    if (!productInfo.Item1)
                    {
                        Console.ForegroundColor = ConsoleColor.Green;
                        Console.WriteLine($"发现新可播放电影:{productName}");
                        Console.ForegroundColor = ConsoleColor.Gray;
                        // Known film that has just become playable: attach the play link.
                        _productService.UpdateLink(productInfo.Item2, CreatePlayLinks(detailURL));
                    }
                }
                else
                {
                    Console.WriteLine($"已存在:{productName}");
                }
            });
        }
        catch (Exception ex)
        {
            Console.ForegroundColor = ConsoleColor.Red;
            Console.WriteLine(productName + ex.Message + ex.InnerException);
            Console.WriteLine("failed");
            Console.ForegroundColor = ConsoleColor.Gray;
        }
    }

    // A single, already-approved play link pointing at the given address.
    private static List<FM_LinkDTO> CreatePlayLinks(string address)
    {
        return new List<FM_LinkDTO>
        {
            new FM_LinkDTO
            {
                ID = Guid.NewGuid(),
                Address = address,
                AuditStatus = AuditStatus.AuditPass,
                AuditTime = DateTime.Now,
                LinkType = LinkType.PlayUrl
            }
        };
    }

    // Finds the pager link whose data-key is the current page number + 1 and crawls it.
    // NOTE(review): selectors and target host are the iQIYI ones, kept unchanged — confirm.
    private void CrawlNextPage(SpireClient client, string html)
    {
        var currentPage = client.SelectNodes(html, "//div[@class='mod-page']/span[@class='curPage']").SingleOrDefault();
        int pageIndex;
        if (currentPage == null || !int.TryParse(currentPage.InnerText, out pageIndex))
        {
            return;
        }

        foreach (var link in client.SelectNodes(html, "//div[@class='mod-page']/a[@data-search-page='item']"))
        {
            var key = link.Attributes["data-key"];
            var href = link.Attributes["href"];
            int pageNumber;
            // Non-numeric keys ("up"/"down") and links without href are skipped.
            if (key == null || href == null || !int.TryParse(key.Value, out pageNumber))
            {
                continue;
            }
            if (pageNumber == pageIndex + 1)
            {
                Url = $"http://list.iqiyi.com/{href.Value}";
                SpliderResult();
                // Stop at the first match so a duplicated pager link cannot start a second crawl.
                return;
            }
        }
    }
}
}
ISplider
using Ohye.Film.DTO.Film;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace Ohye.Film.Service.Spider
{
/// <summary>
/// Contract shared by the site spiders: a settable page URL and a method that starts the crawl.
/// </summary>
public interface ISplider
{
/// <summary>
/// URL of the page the spider fetches when <see cref="SpliderResult"/> is called.
/// </summary>
string Url { get; set; }
/// <summary>
/// Starts crawling <see cref="Url"/>.
/// </summary>
void SpliderResult();
}
}
后续....
感兴趣的可以加入下面群