免积分源码地址
个人比较喜欢小旭和月之门出品的游戏音乐。发现月之门不知道什么时候换了个漂亮的网站,而且还提供了音乐在线试听的页面,如图所示。
地址是 http://www.cyberweaver.net/demo.asp
查看了一下网页源代码,发现所有的音乐地址都存放在百度网盘中,于是写了一个爬虫把这些地址都抓取了出来。
因为代码很简单,没什么需要多解释的,就直接上代码了,用的是 C#。
using System;
using System.Collections;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using System.Windows.Forms;
using System.Xml;

namespace GetUrl
{
    /// <summary>
    /// Small crawler form: downloads a page, extracts the "demo_class.asp" links from it,
    /// follows each of them and collects the pan.baidu.com share link found on every demo page.
    /// </summary>
    public partial class Form1 : Form
    {
        // Base address the relative demo_class.asp links are resolved against.
        private const string SiteRoot = "http://www.cyberweaver.net/";

        // Text file the collected song addresses are appended to.
        private const string AddressListFile = "月之门游戏音乐地址.txt";

        // Locally saved copy of the demo page, used only by the regex test button.
        private const string SavedDemoPage = "月之门在线试听网页.html";

        // Matches links such as demo_class.asp?sort=<4 non-Latin-1 chars>&id=<alphanumeric id>.
        // The original pattern spelled the class as [^x00-xff] (missing backslashes), which
        // excluded the literal character range '0'..'x' instead of the Latin-1 range.
        // NOTE(review): the {4} quantifier is kept from the original; a sample link in the
        // original comments uses a two-character sort value, so confirm 4 is really intended.
        private static readonly Regex DemoPageLinkRegex = new Regex(
            @"demo_class\.asp\?sort=([^\x00-\xff]{4})&id=([a-z0-9]+)",
            RegexOptions.IgnoreCase);

        // Matches e.g. http://pan.baidu.com/share/link?shareid=112744&uk=455690558
        private static readonly Regex SongLinkRegex = new Regex(
            @"http://pan\.baidu\.com/share/link\?shareid=([a-z0-9]+)&uk=([a-z0-9]+)",
            RegexOptions.IgnoreCase);

        // Matches one of the known domain suffixes followed by a slash.
        private static readonly Regex DomainSuffixRegex = new Regex(
            @"(\.com/|\.net/|\.cn/|\.org/|\.gov/)",
            RegexOptions.IgnoreCase);

        // Matches every dot, and a slash at the very end of the input.
        private static readonly Regex DotOrTrailingSlashRegex = new Regex(@"\.|/$");

        string strCode;
        ArrayList alLinks;

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Downloads the page named in the text box, extracts its demo links and writes
        /// them to HyperLinks.xml.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            string strURL;
            if (!TryGetUrl(out strURL))
            {
                return;
            }

            try
            {
                MessageBox.Show("正在获取页面代码,请稍后...");
                strCode = GetPageSource(strURL);
                MessageBox.Show("正在提取超链接,请稍侯...");
                alLinks = GetHyperLinks(strCode);
                MessageBox.Show("正在写入文件,请稍侯...");
                WriteToXml(strURL, alLinks);
            }
            catch (WebException ex)
            {
                ShowError(ex);
            }
            catch (UriFormatException ex)
            {
                ShowError(ex);
            }
            catch (IOException ex)
            {
                ShowError(ex);
            }
        }

        /// <summary>
        /// Reads the URL from the text box, trims it and prefixes "http://" when no
        /// http/https scheme is present. Shows a prompt and returns false when the box is empty.
        /// </summary>
        /// <param name="url">The normalized URL, or null when nothing was entered.</param>
        /// <returns>true when a URL was entered; otherwise false.</returns>
        private bool TryGetUrl(out string url)
        {
            string text = (textBox1.Text ?? string.Empty).Trim();
            if (text.Length == 0)
            {
                MessageBox.Show("请输入网址");
                url = null;
                return false;
            }

            // StartsWith is safe for inputs shorter than the prefix, unlike Substring(0, 7).
            bool hasScheme =
                text.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
                text.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
            url = hasScheme ? text : "http://" + text;
            return true;
        }

        // Reports a failure to the user instead of letting it escape the event handler.
        private static void ShowError(Exception ex)
        {
            MessageBox.Show(ex.Message);
        }

        /// <summary>
        /// Downloads the HTML of the given page and decodes it as GB2312.
        /// </summary>
        /// <param name="URL">Absolute http(s) address of the page.</param>
        /// <returns>The response body as text.</returns>
        /// <exception cref="UriFormatException">URL is not a valid absolute URI.</exception>
        /// <exception cref="WebException">The request failed.</exception>
        public static string GetPageSource(string URL)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(new Uri(URL));

            // Request options must be set before GetResponse() sends the request;
            // the original assigned them afterwards, where they had no effect on it.
            request.Method = "GET";
            request.KeepAlive = false;

            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream body = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(body, Encoding.GetEncoding("GB2312")))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Extracts the distinct demo_class.asp links from the given HTML.
        /// </summary>
        /// <param name="htmlCode">HTML text to scan; null or empty yields an empty list.</param>
        /// <returns>The distinct matched links, sorted with the default comparer.</returns>
        public static ArrayList GetHyperLinks(string htmlCode)
        {
            ArrayList links = new ArrayList();
            if (string.IsNullOrEmpty(htmlCode))
            {
                return links;
            }

            // A set filters duplicate URLs in one pass instead of rescanning the list per match.
            HashSet<string> seen = new HashSet<string>(StringComparer.Ordinal);
            foreach (Match match in DemoPageLinkRegex.Matches(htmlCode))
            {
                if (seen.Add(match.Value))
                {
                    links.Add(match.Value);
                }
            }

            links.Sort();
            return links;
        }

        /// <summary>
        /// Finds the first pan.baidu.com share link in the given HTML.
        /// </summary>
        /// <param name="htmlCode">HTML text of a demo page.</param>
        /// <returns>The first matching link, or null when there is none.</returns>
        public static string GetHyperLinks_Song(string htmlCode)
        {
            if (string.IsNullOrEmpty(htmlCode))
            {
                return null;
            }

            Match first = SongLinkRegex.Match(htmlCode);
            return first.Success ? first.Value : null;
        }

        /// <summary>
        /// Writes the links to HyperLinks.xml, one element per link, named after the
        /// link's domain suffix as returned by GetDomain.
        /// </summary>
        /// <param name="strURL">Address the links were extracted from (written as a comment).</param>
        /// <param name="alHyperLinks">Links to write; each item must be a string.</param>
        static void WriteToXml(string strURL, ArrayList alHyperLinks)
        {
            // The using block closes the file even when writing throws part-way through.
            using (XmlTextWriter writer = new XmlTextWriter("HyperLinks.xml", Encoding.UTF8))
            {
                writer.Formatting = Formatting.Indented;
                writer.WriteStartDocument(false);
                writer.WriteDocType("HyperLinks", null, "urls.dtd", null);
                writer.WriteComment("提取自" + strURL + "的超链接");

                // Output shape kept from the original: a HyperLinks element nested in the root.
                writer.WriteStartElement("HyperLinks");
                writer.WriteStartElement("HyperLinks", null);
                writer.WriteAttributeString("DateTime", DateTime.Now.ToString());

                foreach (string link in alHyperLinks)
                {
                    writer.WriteElementString(GetDomain(link), null, link);
                }

                writer.WriteEndElement();
                writer.WriteEndElement();
                writer.Flush();
            }
        }

        /// <summary>
        /// Appends one line to the song address list file, creating the file when needed.
        /// File-system errors are shown in a message box instead of being thrown.
        /// </summary>
        /// <param name="strURL">The text to append as a line.</param>
        static void WriteToTxt(string strURL)
        {
            try
            {
                // append: true creates the file when it is missing, so no separate
                // create-then-reopen step is required.
                using (StreamWriter writer = new StreamWriter(AddressListFile, true, Encoding.Default))
                {
                    writer.WriteLine(strURL);
                }
            }
            catch (IOException e)
            {
                MessageBox.Show(e.Message.ToString());
            }
            catch (UnauthorizedAccessException e)
            {
                MessageBox.Show(e.Message.ToString());
            }
        }

        /// <summary>
        /// Returns the domain suffix of a URL ("com", "net", "cn", "org" or "gov"),
        /// or "other" when none of them is followed by a slash in the URL.
        /// </summary>
        /// <param name="strURL">The URL to inspect.</param>
        static string GetDomain(string strURL)
        {
            if (string.IsNullOrEmpty(strURL))
            {
                return "other";
            }

            // The match looks like ".com/"; strip the dot and the trailing slash.
            string suffix = DotOrTrailingSlashRegex.Replace(DomainSuffixRegex.Match(strURL).Value, "");
            return suffix == "" ? "other" : suffix;
        }

        /// <summary>
        /// Crawls the page named in the text box: follows every demo link and appends the
        /// pan.baidu.com share link of each demo page to the address list file.
        /// </summary>
        private void button1_Click_1(object sender, EventArgs e)
        {
            string strURL;
            if (!TryGetUrl(out strURL))
            {
                return;
            }

            try
            {
                MessageBox.Show("正在获取页面代码,请稍后...");
                string strContent = GetPageSource(strURL);

                // Links of the form demo_class.asp?sort=...&id=... found on the index page.
                ArrayList demoPageLinks = GetHyperLinks(strContent);

                foreach (string demoLink in demoPageLinks)
                {
                    string demoContent = GetPageSource(SiteRoot + demoLink);
                    string songAddress = GetHyperLinks_Song(demoContent);

                    // A demo page without a share link would otherwise add a blank line.
                    if (songAddress != null)
                    {
                        WriteToTxt(songAddress);
                    }
                }
            }
            catch (WebException ex)
            {
                ShowError(ex);
            }
            catch (UriFormatException ex)
            {
                ShowError(ex);
            }
        }

        /// <summary>
        /// Downloads the resource named in the text box to a file chosen by the user.
        /// The address may also point at an image, which is then saved as-is.
        /// </summary>
        private void button2_Click(object sender, EventArgs e)
        {
            string strURL;
            if (!TryGetUrl(out strURL))
            {
                return;
            }

            SaveFileDialog sfg = new SaveFileDialog();
            sfg.Filter = "网页文件(*.html)|*.html;*.htm";
            if (sfg.ShowDialog() != DialogResult.OK)
            {
                return;
            }

            try
            {
                WebRequest request = WebRequest.Create(strURL);
                using (WebResponse response = request.GetResponse())
                using (Stream source = response.GetResponseStream())
                // FileMode.Create truncates an existing file; OpenOrCreate would leave the
                // tail of a longer previous file behind the newly written bytes.
                using (FileStream target = new FileStream(sfg.FileName, FileMode.Create, FileAccess.Write))
                {
                    source.CopyTo(target);
                }
            }
            catch (WebException ex)
            {
                ShowError(ex);
            }
            catch (UriFormatException ex)
            {
                ShowError(ex);
            }
            catch (IOException ex)
            {
                ShowError(ex);
            }
        }

        // Placeholder handler for analysing the links on http://www.cyberweaver.net/demo.asp.
        private void button3_Click(object sender, EventArgs e)
        {
        }

        /// <summary>
        /// Regex test: runs the demo link extraction against the locally saved demo page.
        /// </summary>
        private void button3_Click_1(object sender, EventArgs e)
        {
            StringBuilder content = new StringBuilder();
            try
            {
                using (StreamReader sr = new StreamReader(SavedDemoPage, Encoding.GetEncoding("gb2312")))
                {
                    // Lines are joined without separators, as in the original.
                    string line;
                    while ((line = sr.ReadLine()) != null)
                    {
                        content.Append(line);
                    }
                }
            }
            catch (Exception e2)
            {
                Console.WriteLine(e2.Message);

                // Nothing to analyse; the original went on and failed on the null content.
                return;
            }

            // Strip the double quotes first, e.g. <a href="demo_class.asp?sort=...&id=605">.
            string strContent = content.ToString().Replace("\"", "");

            // The result is not used any further; presumably it is inspected in a debugger.
            ArrayList al = GetHyperLinks(strContent);
        }
    }
}
最终得到的在线试听地址如下
现在的难点在于如何从百度网盘中批量下载文件。我的思路是将百度网盘的下载地址转换为迅雷地址,然后通过迅雷的批量任务下载。但是百度网盘地址转迅雷地址该怎么实现,我还没有办法,希望高手不吝赐教。
witch 2013-1-9