public class DoubanBaiKe {
private Logger logger = LoggerFactory.getLogger(this.getClass());
/**
* 豆瓣电影搜索的URL地址
*/
private String url = "http://movie.douban.com/subject_search?search_text=#&cat=1002";
/**
* 豆瓣电影URL地址
*/
private String baikeViewURL = "http://movie.douban.com";
/**
* 豆瓣电影页面的编码
*/
private final static String charset = "utf-8";
private HtmlCleaner cleaner;
/**
* 获取日期的正则表达式
*/
final String dateRegex = "([0-9]{4}-[0-9]{2}-[0-9]{2})|([0-9]{4}-[0-9]{2})|([0-9]{4})";
/**
* 存储捉取不到结果的影视
*/
public static StringBuilder noResult = new StringBuilder();
public DoubanBaiKe() {
initCleaner();
}
/**
* 初始化HtmlCleaner实例,在构造函数里调用
*/
private void initCleaner() {
cleaner = new HtmlCleaner();
CleanerProperties props = cleaner.getProperties();
props.setUseEmptyElementTags(false);
}
/**
* 获取指定影视名称,去豆瓣电影s中捉取页面,并处理主要内容的节点
*
* @param videoName
* 影视名称
* @return
*
*/
private TagNode getContentTagNode(String videoName) {
try {
String searchWordUrl = url.replace("#", URLEncoder.encode(
videoName, charset));
System.out.println(searchWordUrl);
String xpath = "//div[@id='content']//div[@class='article']/table";
TagNode node = cleaner.clean(new URL(searchWordUrl), charset);
TagNode resultNode[] = HtmlCleanerXpathUtil.xpathArray(node, xpath);
if (resultNode == null || resultNode.length <= 0) {
return null;
}
for (TagNode tableNode : resultNode) {
// 捉取影视名称进行匹配
TagNode titleNode = HtmlCleanerXpathUtil.xpathOne(tableNode,
"tbody/tr/td[2]/div");
String titles[] = titleNode.getText().toString().split("/");
for (String title : titles) {
// 对每一个名字进行匹配
if (videoName.equals(title.trim())) {
logger.info("找到相应的链接:" + videoName);
TagNode a_node = titleNode.getChildTags()[0];
// 获取相应的页面链接
String href = a_node.getAttributeByName("href");
TagNode targetNode = cleaner.clean(new URL(href),
charset);
if (targetNode != null) {
return HtmlCleanerXpathUtil
.xpathOne(targetNode,
"//div[@id='content']//div[@class='article']");
}
}
}
}
logger.info("没有找到相应的链接:" + videoName);
} catch (Exception e) {
logger.info(e.getMessage(), e);
}
return null;
}
/**
* 存储图片
*/
private static final String defaultPicDIR="pic";
/**
* 获取影视百科对象
*
* @param name
* 影视名称
* @return
*/
public Video getVideoBaiKe(String name) {
Video video = new Video();
video.setName(name);
TagNode content = getContentTagNode(name);
if (content == null) {
noResult.append(name + "/n");
return video;
}
// 获取图片
TagNode imgNode = HtmlCleanerXpathUtil.xpathOne(content,
"//div[@id='mainpic']/a");
if (imgNode != null) {
String imgHref = imgNode.getAttributeByName("href");
String fileName = DigestUtils.md5Hex(url);
System.out.println(fileName);
fileName=defaultPicDIR+File.separator+fileName+".jpg";
//写入文件
PicFileUtils.writePicToFile(imgHref, fileName);
video.setImg(fileName);
}
// 获取编剧,导演,主演的信息
handleBasicInfo(video, content);
handleBasicOtherInfo(video, content);
return video;
}
/**
* 处理基本信息
*
* @param video
* @param content
*/
public void handleBasicInfo(Video video, TagNode content) {
// 获取编剧,导演,主演的信息
TagNode infoNode[] = HtmlCleanerXpathUtil.xpathArray(content,
"//div[@id='info']/span");
if (infoNode == null) {
return;
}
for (TagNode node : infoNode) {
int point = node.getText().indexOf(":");
String nodeName = node.getText().substring(0, point).trim();
String nodeValue = node.getText().substring(point + 1,
node.getText().length());
if (nodeName.equals("导演")) {
video.setDirector(nodeValue);
} else if (nodeName.equals("编剧")) {
video.setWriter(nodeValue);
} else if (nodeName.equals("主演")) {
video.setRoles(nodeValue);
}
}
}
/**
* 处理其它信息
*
* @param video
* @param content
*/
public void handleBasicOtherInfo(Video video, TagNode content) {
// 获取其它相关信息
TagNode obmoNode = HtmlCleanerXpathUtil.xpathOne(content,
"//div[@id='info']/div[@class='obmo']");
if (obmoNode == null) {
return;
}
String text = cleaner.getInnerHtml(obmoNode).trim();
String s[] = text.split("<br//s*/>");
String regex = "<span//s*class=/"pl/">(.*?)</span>(.*)";
Pattern p = Pattern.compile(regex);
for (String line : s) {
if (line.length() <= 0 || line.trim().isEmpty()) {
continue;
}
Matcher matcher = p.matcher(line);
if (matcher.find()) {
String groupName = matcher.group(1).trim();
String groupValue = matcher.group(2).trim();
System.out.println(groupName);
if ("制片国家/地区:".equals(groupName)) {
video.setZone(groupValue);
} else if ("语言:".equals(groupName)) {
video.setLanguage(groupValue);
} else if ("又名:".equals(groupName)) {
video.setOtherName(groupValue);
} else if ("上映日期:".equals(groupName)) {
Pattern p2 = Pattern.compile(dateRegex);
Matcher dateMatcher = p2.matcher(groupValue);
if (dateMatcher.find()) {
System.out.println(dateMatcher.group(0));
video.setShowDate(dateMatcher.group(0));
}
} else if ("集数:".equals(groupName)) {
video.setEpisodes(groupValue);
System.out.println(groupValue);
}
}
}
}
public static void main(String[] args) {
DoubanBaiKe douban = new DoubanBaiKe();
System.out.println(douban.getVideoBaiKe("导火线"));
System.out.println(DoubanBaiKe.noResult.toString());
}
}