Java 根据正则表达式解析 HTML 网页内容

仅供参考:

import java.io.DataInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.cms.common.entity.HttpRespons;

public class test {


	public static void main(String[] args) {
		 getLyric("逐浪飞花");
	}
	 
	 /**
		 * 获取可下载的歌词信息
		 * @param songName  歌曲名称
		 * 
		 */
	 public static List> getLyric(String songName) {
		   List> reqMap = new ArrayList>();
			try {  
	            HttpRequester request = new HttpRequester();  
				String urlNameString = "http://www.lrcgc.com/so/?q="+ songName;
	            HttpRespons hr = request.sendGet(urlNameString);  
	            String content = hr.getContent();
	            //返回内容
//	            System.out.println(content);
	            //获取歌曲信息的结果  如: 逐浪飞花, 
	        	String regex =  "]*href=\"/lyric-[^>]*>.*?";
	            List link = getContentByRegex(content,regex);
//	            System.out.println("路径:"+link);  
	             
	            //歌曲名称列表
		        List songUrlList = match(link.toString(), "a", "href");  
//		        System.out.println("值:"+songUrlList); 
	         
		        //歌手列表
		        List songNameList = getLabelValues(link.toString(),regex);
//		        System.out.println(songNameList);
	            
	          regex =  "]*href=\"/songlist-[^>]*>.*?";
	          link = getContentByRegex(content,regex);
//	          System.out.println("歌手:"+link);  
	        //歌手列表
	          List singerList = getLabelValues(link.toString(),regex);
//	          System.out.println(singerList);
	          
	          for (int i = 0; i < singerList.size(); i++) {
				Map map = new HashMap();
				map.put("singerName", singerList.get(i).replace("&", "&"));
				map.put("songName", songNameList.get(i));
				//下载链接 TODO
				map.put("songUrl", geciDownlrc(songUrlList.get(i)));
				reqMap.add(map);
	          }
	            
	        } catch (Exception e) {  
	            e.printStackTrace();  
	        }
			
			return reqMap;
		}  
		
		/**
		 *  
		 *   下载 歌词信息
		 */
		private static String geciDownlrc(String songUrl) {
			try {  
				HttpRequester request = new HttpRequester();  
				//歌曲名称
//	          http://www.lrcgc.com/lyric-26228-242158.html
				String urlNameString = "http://www.lrcgc.com/"+songUrl;
				HttpRespons hr = request.sendGet(urlNameString);  
				//请求链接    
				String content = hr.getContent();
				//返回内容
//	            System.out.println(content);
				
				//获取歌曲信息的结果  如: 逐浪飞花, 
				String regex =  "]*id=\"J_downlrc\"[^>]*>.*?";
				List link = getContentByRegex(content,regex);
//				System.out.println(link);  
				
				List list = match(link.toString(), "a", "href");  
//				System.out.println("值:"+list); 
				
				String fileName = "";
				String fileUrl = "";
				//获取文件名称
				if (list != null && list.size() > 0) {
					fileUrl = list.get(0).replace("&", "&");
					fileName = fileUrl.substring(fileUrl.indexOf("/")+1,fileUrl.length());
					fileUrl = "http://www.lrcgc.com//"+fileUrl;
				}
				
//				System.out.println("fileUrl:"+fileUrl);
//				System.out.println("fileName:"+fileName);
				
				return fileUrl;
				
			} catch (Exception e) {  
				e.printStackTrace();  
			}
			return "";
		}  
		 
	    
	    /** 
	     * 传入要下载的文件的url,将url所对应的文件下载到本地 
	     * @param urlString  下载的文件的url
	     * @param fileName	 文件名称
	     */
	    public static  void downloadFile(String urlString,String fileName) { 
	    	String localFilePath = "C:\\Users\\Administrator\\Desktop\\"+fileName;
	    	  try {  
	    	      URL url  = new URL(urlString);  
	              DataInputStream dataInputStream = new DataInputStream(url.openStream());  
	              FileOutputStream fileOutputStream = new FileOutputStream(new File(localFilePath));  
	              byte[] buffer = new byte[1024];  
	              int length;  
	              while ((length = dataInputStream.read(buffer)) > 0) {  
	                  fileOutputStream.write(buffer, 0, length);  
	              }  
	              dataInputStream.close();  
	              fileOutputStream.close();  
	          } catch (MalformedURLException e) {  
	              e.printStackTrace();  
	          } catch (IOException e) {  
	              e.printStackTrace();  
	          }  
	    }  
		
		/**
		 * 
		 * @param html
		 * @return 获得网页标题
		 */
		public static String getTitle( String html) {
			String regex;
			String title = "";
			final List list = new ArrayList();
			regex = ".*?";
			final Pattern pa = Pattern.compile(regex, Pattern.CANON_EQ);
			final Matcher ma = pa.matcher(html);
			while (ma.find()) {
				list.add(ma.group());
			}
			for (int i = 0; i < list.size(); i++) {
				title = title + list.get(i);
			}
			return title.replaceAll("<.*?>", "");
		}
	    
		/**
		 * 获取标签中的值
		 * @param html  内容
		 * @param regex  正则表达式
		 * @return 
		 */
		public static List getLabelValues(String html,String regex) {
//			String regex;
			final List list = new ArrayList();
//			regex = "]*href=\"/lyric-[^>]*>(.*?)";
			final Pattern pa = Pattern.compile(regex, Pattern.CANON_EQ);
			final Matcher ma = pa.matcher(html);
			while (ma.find()) {
				list.add(ma.group().replaceAll("<.*?>", ""));
			}
			return list;
		}
		
		
		/**
		 * 获取匹配的正则表达式
		 * @param s 内容
		 * @param regex 正则表达式
		 * @return
		 */
		public static List getContentByRegex(String s,String regex) {
			
			final List list = new ArrayList();
			//获得页面所有的链接
			final Pattern pa = Pattern.compile(regex, Pattern.DOTALL);
			final Matcher ma = pa.matcher(s);
			while (ma.find()) {
				list.add(ma.group());
			}
			return list;
		}
	    
		
		/**
		 * 
		 * @param s
		 * @return 获得所有的超链接
		 */
		public List getLink(final String s) {
			String regex;
			final List list = new ArrayList();
			regex = "]*href=(\"([^\"]*)\"|\'([^\']*)\'|([^\\s>]*))[^>]*>(.*?)";
			final Pattern pa = Pattern.compile(regex, Pattern.DOTALL);
			final Matcher ma = pa.matcher(s);
			while (ma.find()) {
				list.add(ma.group());
			}
			return list;
		}
		
		 /** 
	     * 获取指定HTML标签的指定属性的值 
	     * @param source 要匹配的源文本 
	     * @param element 标签名称 
	     * @param attr 标签的属性名称 
	     * @return 属性值列表 
	     */  
	    public static List match(String source, String element, String attr) {  
	        List result = new ArrayList();  
	        String reg = "<" + element + "[^<>]*?\\s" + attr + "=['\"]?(.*?)['\"]?(\\s.*?)?>";  
	        Matcher m = Pattern.compile(reg).matcher(source);  
	        while (m.find()) {  
	            String r = m.group(1);  
	            result.add(r);  
	        }  
	        return result;  
	    }  
	      
	   
	 
	 
	 


}

HttpRespons hr = request.sendGet(urlNameString);  

上面用到的 request.sendGet(...) 方法的实现请参考:http://blog.csdn.net/qq_27292113/article/details/71534346 ,该文章中有详细的代码。


你可能感兴趣的:(web开发,java基础)