用 Java 抓取百度 Top500 歌曲(附源码下载)
在整理完在线听歌( http://www.5a520.cn )模块之后,剩下的工作就是通过程序抓取百度上最新的热门歌曲。需要抓取的属性主要有 3 个:歌名、歌曲在线播放地址和歌词内容(符合 LRC 歌词格式)。目前已完成歌名和歌曲地址的抓取;由于百度的歌曲地址很多是通过 JS 动态获取的,所以这里改用搜狗音乐搜索来获取歌曲地址,会更方便一些。全部源码如下:
/** */
/**
http://www.bt285.cn http://www.5a520.cn
*/
package com.common.utils;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import com.common.doc.FileOperUtils;
/**
 * Mutable value holder for one scraped song: its title, its address and its
 * lyrics text.
 */
class Song {

    private String name;
    private String url;
    /** Lyrics text; starts out as the empty string until a caller sets it. */
    private String lrc = "";

    /**
     * Creates a song with the given title and address and empty lyrics.
     *
     * @param name song title
     * @param url  song address
     */
    public Song(String name, String url) {
        this.name = name;
        this.url = url;
    }

    /** @return the song title */
    public String getName() {
        return this.name;
    }

    /** @return the song address */
    public String getUrl() {
        return this.url;
    }

    /** @return the lyrics text */
    public String getLrc() {
        return this.lrc;
    }

    /** @param name new song title */
    public void setName(String name) {
        this.name = name;
    }

    /** @param url new song address */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @param lrc new lyrics text */
    public void setLrc(String lrc) {
        this.lrc = lrc;
    }
}
public class BaiduMp3 {
public static String visitURL(String strUrl) {
URL url = null;
try {
url = new URL(strUrl);
} catch (MalformedURLException e) {
e.printStackTrace();
}
URLConnection conn = null;
try {
conn = url.openConnection();
conn.setDoOutput(true);
} catch (IOException e) {
System.out.println("e:"+e.getMessage());
}
OutputStreamWriter out;
try {
out = new OutputStreamWriter(conn.getOutputStream(), "GBK");
out.flush();
out.close();
} catch (UnsupportedEncodingException e2) {
e2.printStackTrace();
} catch (IOException e2) {
e2.printStackTrace();
}
// 接收返回信息
BufferedReader rd = null;
try {
rd = new BufferedReader(
new InputStreamReader(conn.getInputStream()));
return rd.readLine();
} catch (IOException e1) {
e1.printStackTrace();
}
return "";
}
/** *//**
* 功能说明:访问指定的URL并检查返回结果。
* @param strUrl
* @param successFlag 请求成功的标识,比如包含“_SUCCESS”字。
* @return
*/
public static String visitURL(String strUrl, String successFlag) {
boolean rs = false;
HttpURLConnection jconn = null;
ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
try {
URL url = new URL(strUrl);
jconn = (HttpURLConnection) url.openConnection();
jconn.setDoOutput(true);
jconn.setDoInput(true);
jconn.connect();
InputStream in = jconn.getInputStream();
byte[] buf = new byte[4096];
int bytesRead;
while ((bytesRead = in.read(buf)) != -1) {
byteArrayOutputStream.write(buf, 0, bytesRead);
}
String strRead = new String(byteArrayOutputStream.toByteArray(),"GBK");
return strRead;
} catch (MalformedURLException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
jconn.disconnect();
try {
byteArrayOutputStream.close();
} catch (IOException e) {
e.printStackTrace();
}
}
return "";
}
private static boolean isTrimEmptyOrBlank(String astr) {
if ((null == astr) || (astr.length() == 0) || " ".equals(astr)) {
return true;
}
astr = astr.trim();
if ((null == astr) || (astr.length() == 0)) {
return true;
}
return false;
}
private static String getFilteredContent(String htmlContent, String reg,int i) {
String content = "";
int k=1;
Pattern pp = Pattern.compile(reg, Pattern.DOTALL);
Matcher m = pp.matcher(htmlContent);
while (m.find()) {
content = m.group();
if(k++==i)
break;
}
return content;
}
public static List<Song> getBaiduSongs(){
List<Song> ss = new ArrayList();
String htmlContent = visitURL("http://list.mp3.baidu.com/topso/mp3topsong.html?id=1?top2","s");
String encode = "GBK";
// System.out.println("===========================================================================");
// System.out.println(htmlContent);
// System.out.println("===========================================================================");
String reg = "<table width=\"100%\" align=\"center\" cellpadding=\"0\" cellspacing=\"0\" class=\"list\">(.*?)</table>";
htmlContent = getFilteredContent(htmlContent,reg,0);
//FileOperUtils.writeFile("c:\\1.html", htmlContent, false);
String line = "",lineurl="";
Node anode = null;
TextNode textnode = null;
try {
Parser parser = Parser.createParser(htmlContent, encode);
NodeClassFilter textFilter = new NodeClassFilter(LinkTag.class);
OrFilter lastFilter = new OrFilter();
lastFilter.setPredicates(new NodeFilter[] { textFilter });
NodeList nodeList = parser.parse(lastFilter);
Node[] nodes = nodeList.toNodeArray();
for (int i = 0; i < nodes.length; i++) {
anode = (Node) nodes[i];
if(anode instanceof LinkTag){
LinkTag txt = (LinkTag)anode;
line = txt.getLinkText();
if(txt.getPreviousSibling()!=null){
if(txt.getPreviousSibling().toString().indexOf("(")>=0)
continue;
}
line = txt.getLinkText();
lineurl = txt.getAttribute("href");
//System.out.println(txt.getLink());
}
if (isTrimEmptyOrBlank(line)||isTrimEmptyOrBlank(lineurl))
continue;
ss.add(new Song(line,getSongURL(line)));
}
} catch (ParserException pe) {
pe.printStackTrace();
}
return ss;
}
private static String getSongURL(String songname){
try {
String ss = URLEncoder.encode(songname,"GBK");
String htmlContent = visitURL("http://so.mp3.qihoo.com/?type=0&src=s&kw="+ss,"s");
String encode = "GBK";
http://www.feng123.com
String reg = "<table width=\"100%\" border=\"0\" cellspacing=\"0\" cellpadding=\"0\">(.*?)</table>"; http://www.5a520.cn
htmlContent = getFilteredContent(htmlContent,reg,1);
String line = "",lineurl="";
Node anode = null;
TextNode textnode = null;
Parser parser = Parser.createParser(htmlContent, encode);
NodeClassFilter textFilter = new NodeClassFilter(LinkTag.class);
OrFilter lastFilter = new OrFilter();
lastFilter.setPredicates(new NodeFilter[] { textFilter });
NodeList nodeList = parser.parse(lastFilter);
Node[] nodes = nodeList.toNodeArray();
for (int i = 0; i < nodes.length; i++) {
anode = (Node) nodes[i];
if(anode instanceof LinkTag){
LinkTag txt = (LinkTag)anode;
line = txt.getLinkText();
lineurl = txt.getAttribute("href");
if(!isTrimEmptyOrBlank(lineurl) && lineurl.startsWith("down.html")){
String s = getFilteredContent(lineurl,"u=(.*?)\\&",0);
if(!s.equals("")&&s.length()>5){
s = Utils.replace(s, "u=", "");
s = Utils.replace(s, "&", "");
s = URLDecoder.decode(s,"GBK");
return s;
}
}
}
}
} catch (Exception pe) {
pe.printStackTrace();
}
return "";
}
public static void main(String[] args) throws Exception{
List<Song> ss = getBaiduSongs();
int idx = 0;
for(Song s:ss){
System.out.println((++idx)+":"+s.getName()+"->"+s.getUrl());
}
// String ss = getSongURL("国家");
// System.out.println(ss);
// String s = URLDecoder.decode("http%3A%2F%2F http://www.5a520.cn %2F%B9%FA%BC%D2.mp3","GBK");
// System.out.println(s);
}
}
/**
http://www.bt285.cn http://www.5a520.cn
*/
package com.common.utils;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import com.common.doc.FileOperUtils;
/**
 * Mutable value holder for one scraped song: its title, its address and its
 * lyrics text.
 */
class Song {

    private String name;
    private String url;
    /** Lyrics text; starts out as the empty string until a caller sets it. */
    private String lrc = "";

    /**
     * Creates a song with the given title and address and empty lyrics.
     *
     * @param name song title
     * @param url  song address
     */
    public Song(String name, String url) {
        this.name = name;
        this.url = url;
    }

    /** @return the song title */
    public String getName() {
        return this.name;
    }

    /** @return the song address */
    public String getUrl() {
        return this.url;
    }

    /** @return the lyrics text */
    public String getLrc() {
        return this.lrc;
    }

    /** @param name new song title */
    public void setName(String name) {
        this.name = name;
    }

    /** @param url new song address */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @param lrc new lyrics text */
    public void setLrc(String lrc) {
        this.lrc = lrc;
    }
}
public class BaiduMp3 {
public static String visitURL(String strUrl) {
URL url = null;
try {
url = new URL(strUrl);
} catch (MalformedURLException e) {
e.printStackTrace();
}
URLConnection conn = null;
try {
conn = url.openConnection();
conn.setDoOutput(true);
} catch (IOException e) {
System.out.println("e:"+e.getMessage());
}
OutputStreamWriter out;
try {
out = new OutputStreamWriter(conn.getOutputStream(), "GBK");
out.flush();
out.close();
} catch (UnsupportedEncodingException e2) {
e2.printStackTrace();
} catch (IOException e2) {
e2.printStackTrace();
}
// 接收返回信息
BufferedReader rd = null;
try {
rd = new BufferedReader(
new InputStreamReader(conn.getInputStream()));
return rd.readLine();
} catch (IOException e1) {
e1.printStackTrace();
}
return "";
}
/** *//**
* 功能说明:访问指定的URL并检查返回结果。
* @param strUrl
* @param successFlag 请求成功的标识,比如包含“_SUCCESS”字。
* @return
*/
public static String visitURL(String strUrl, String successFlag) {
boolean rs = false;
HttpURLConnection jconn = null;
ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
try {
URL url = new URL(strUrl);
jconn = (HttpURLConnection) url.openConnection();
jconn.setDoOutput(true);
jconn.setDoInput(true);
jconn.connect();
InputStream in = jconn.getInputStream();
byte[] buf = new byte[4096];
int bytesRead;
while ((bytesRead = in.read(buf)) != -1) {
byteArrayOutputStream.write(buf, 0, bytesRead);
}
String strRead = new String(byteArrayOutputStream.toByteArray(),"GBK");
return strRead;
} catch (MalformedURLException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
jconn.disconnect();
try {
byteArrayOutputStream.close();
} catch (IOException e) {
e.printStackTrace();
}
}
return "";
}
private static boolean isTrimEmptyOrBlank(String astr) {
if ((null == astr) || (astr.length() == 0) || " ".equals(astr)) {
return true;
}
astr = astr.trim();
if ((null == astr) || (astr.length() == 0)) {
return true;
}
return false;
}
private static String getFilteredContent(String htmlContent, String reg,int i) {
String content = "";
int k=1;
Pattern pp = Pattern.compile(reg, Pattern.DOTALL);
Matcher m = pp.matcher(htmlContent);
while (m.find()) {
content = m.group();
if(k++==i)
break;
}
return content;
}
public static List<Song> getBaiduSongs(){
List<Song> ss = new ArrayList();
String htmlContent = visitURL("http://list.mp3.baidu.com/topso/mp3topsong.html?id=1?top2","s");
String encode = "GBK";
// System.out.println("===========================================================================");
// System.out.println(htmlContent);
// System.out.println("===========================================================================");
String reg = "<table width=\"100%\" align=\"center\" cellpadding=\"0\" cellspacing=\"0\" class=\"list\">(.*?)</table>";
htmlContent = getFilteredContent(htmlContent,reg,0);
//FileOperUtils.writeFile("c:\\1.html", htmlContent, false);
String line = "",lineurl="";
Node anode = null;
TextNode textnode = null;
try {
Parser parser = Parser.createParser(htmlContent, encode);
NodeClassFilter textFilter = new NodeClassFilter(LinkTag.class);
OrFilter lastFilter = new OrFilter();
lastFilter.setPredicates(new NodeFilter[] { textFilter });
NodeList nodeList = parser.parse(lastFilter);
Node[] nodes = nodeList.toNodeArray();
for (int i = 0; i < nodes.length; i++) {
anode = (Node) nodes[i];
if(anode instanceof LinkTag){
LinkTag txt = (LinkTag)anode;
line = txt.getLinkText();
if(txt.getPreviousSibling()!=null){
if(txt.getPreviousSibling().toString().indexOf("(")>=0)
continue;
}
line = txt.getLinkText();
lineurl = txt.getAttribute("href");
//System.out.println(txt.getLink());
}
if (isTrimEmptyOrBlank(line)||isTrimEmptyOrBlank(lineurl))
continue;
ss.add(new Song(line,getSongURL(line)));
}
} catch (ParserException pe) {
pe.printStackTrace();
}
return ss;
}
private static String getSongURL(String songname){
try {
String ss = URLEncoder.encode(songname,"GBK");
String htmlContent = visitURL("http://so.mp3.qihoo.com/?type=0&src=s&kw="+ss,"s");
String encode = "GBK";
http://www.feng123.com
String reg = "<table width=\"100%\" border=\"0\" cellspacing=\"0\" cellpadding=\"0\">(.*?)</table>"; http://www.5a520.cn
htmlContent = getFilteredContent(htmlContent,reg,1);
String line = "",lineurl="";
Node anode = null;
TextNode textnode = null;
Parser parser = Parser.createParser(htmlContent, encode);
NodeClassFilter textFilter = new NodeClassFilter(LinkTag.class);
OrFilter lastFilter = new OrFilter();
lastFilter.setPredicates(new NodeFilter[] { textFilter });
NodeList nodeList = parser.parse(lastFilter);
Node[] nodes = nodeList.toNodeArray();
for (int i = 0; i < nodes.length; i++) {
anode = (Node) nodes[i];
if(anode instanceof LinkTag){
LinkTag txt = (LinkTag)anode;
line = txt.getLinkText();
lineurl = txt.getAttribute("href");
if(!isTrimEmptyOrBlank(lineurl) && lineurl.startsWith("down.html")){
String s = getFilteredContent(lineurl,"u=(.*?)\\&",0);
if(!s.equals("")&&s.length()>5){
s = Utils.replace(s, "u=", "");
s = Utils.replace(s, "&", "");
s = URLDecoder.decode(s,"GBK");
return s;
}
}
}
}
} catch (Exception pe) {
pe.printStackTrace();
}
return "";
}
public static void main(String[] args) throws Exception{
List<Song> ss = getBaiduSongs();
int idx = 0;
for(Song s:ss){
System.out.println((++idx)+":"+s.getName()+"->"+s.getUrl());
}
// String ss = getSongURL("国家");
// System.out.println(ss);
// String s = URLDecoder.decode("http%3A%2F%2F http://www.5a520.cn %2F%B9%FA%BC%D2.mp3","GBK");
// System.out.println(s);
}
}