写一个jsp页面,访问时显示从下面的页面提取出的销售商详细信息(价格、运费、经销商基本信息等),并把这些信息写到一个数据库表里。
http://www.amazon.com/gp/offer-listing/B0012J52OC/
数据抓取的问题,涉及到
1 用URLConnection 读取页面信息,用httpclient也行
2 用Pattern 解析页面并拿到你要的信息
3 显示数据
4 存入数据库
这个是一个综合的考试,涉及的知识面比较广。
1 我这里只给出关键的部分,使用java程序实现,而不是JSP的代码。移植工作请自行完成。
2 我使用自己的数据库连接,实际使用时最好替换为应用服务器提供的数据源
3 代码分四部分:数据库结构、POJO类、应用程序和辅助类
一、数据库结构 AmazonGoods.sql 使用的是MySQL的数据库
-
-
-
- CREATE TABLE `amazongoods` (
- `id` int(11) NOT NULL AUTO_INCREMENT,
- `price` decimal(10,0) NOT NULL,
- `shipping` decimal(10,0) NOT NULL,
- `Seller` text NOT NULL,
- PRIMARY KEY (`id`)
- ) ENGINE=InnoDB AUTO_INCREMENT=11 DEFAULT CHARSET=utf8;
-- ----------------------------
-- Table structure for amazongoods
-- ----------------------------
-- One row per seller offer scraped from the listing page.
-- price and shipping hold dollar amounts with cents (e.g. 6.04), so they are
-- declared with two fractional digits; decimal(10,0) would round every value
-- to a whole dollar on insert.
CREATE TABLE `amazongoods` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`price` decimal(10,2) NOT NULL,
`shipping` decimal(10,2) NOT NULL,
`Seller` text NOT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=11 DEFAULT CHARSET=utf8;
二、POJO类 AmazonGoods.java
- package com.laozizhu.test.amazon;
-
- import java.math.BigDecimal;
-
-
-
-
-
-
-
- class AmazonGoods {
- public long getId() {
- return id;
- }
-
- public void setId(long id) {
- this.id = id;
- }
-
- public BigDecimal getPrice() {
- return price;
- }
-
- public void setPrice(BigDecimal price) {
- this.price = price;
- }
-
- public BigDecimal getShipping() {
- return shipping;
- }
-
- public void setShipping(BigDecimal shipping) {
- this.shipping = shipping;
- }
-
- public String getSeller() {
- return seller;
- }
-
- public void setSeller(String seller) {
- this.seller = seller;
- }
-
-
- private long id;
-
- private BigDecimal price;
-
- private BigDecimal shipping;
-
- private String seller;
- }
package com.laozizhu.test.amazon;
import java.math.BigDecimal;
/**
 * One row of scraped offer data: a price, a shipping fee and the seller
 * information that belongs to it.
 *
 * @author Laozizhu (laozizhu.com)
 */
class AmazonGoods {
    /** Serial number; primary key of the row. */
    private long id;

    /** Offer price. */
    private BigDecimal price;

    /** Shipping fee; stays {@code null} until a value is set. */
    private BigDecimal shipping;

    /** Seller information. */
    private String seller;

    /** @return the serial number (primary key) */
    public long getId() {
        return this.id;
    }

    /** @param id the serial number (primary key) */
    public void setId(long id) {
        this.id = id;
    }

    /** @return the offer price */
    public BigDecimal getPrice() {
        return this.price;
    }

    /** @param price the offer price */
    public void setPrice(BigDecimal price) {
        this.price = price;
    }

    /** @return the shipping fee, or {@code null} if none was set */
    public BigDecimal getShipping() {
        return this.shipping;
    }

    /** @param shipping the shipping fee */
    public void setShipping(BigDecimal shipping) {
        this.shipping = shipping;
    }

    /** @return the seller information */
    public String getSeller() {
        return this.seller;
    }

    /** @param seller the seller information */
    public void setSeller(String seller) {
        this.seller = seller;
    }
}
三、应用类
- package com.laozizhu.test.amazon;
-
- import java.math.BigDecimal;
- import java.sql.Connection;
- import java.sql.DriverManager;
- import java.sql.PreparedStatement;
- import java.util.ArrayList;
- import java.util.List;
- import java.util.Properties;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
-
- import com.laozizhu.tools.PageService;
-
- public class AmazonFetch {
-
-
-
-
- public static void main(String[] args) {
-
- initProxy();
-
-
- String str = PageService.getPage("http://www.amazon.com/gp/offer-listing/B0012J52OC/", "ISO-8859-1");
-
- List<AmazonGoods> list = parse(str);
-
- buildTable(list);
-
- saveToMySQL(list);
- }
-
-
-
-
- private static void initProxy() {
- Properties prop = System.getProperties();
-
-
- prop.setProperty("http.proxyHost", "10.60.8.20");
-
- prop.setProperty("http.proxyPort", "8080");
- }
-
-
-
-
- static Pattern pPrice = Pattern.compile(
- "<span class=\"price\">\\$([\\d]+\\.[\\d]{2})</span>.*?(<ul class=\"sellerInformation\">.+?</ul>)", Pattern.DOTALL);
-
-
- static Pattern pShipping = Pattern
- .compile("<span class=\"price_shipping\">\\+ \\$([\\d]+\\.[\\d]{2})</span>", Pattern.DOTALL);
-
-
-
-
-
-
-
-
- private static List<AmazonGoods> parse(String page) {
-
-
- String[] strs = page.split("<tbody class=\"result\">");
-
-
- List<AmazonGoods> list = new ArrayList<AmazonGoods>(strs.length);
- AmazonGoods goods = null;
-
- for (String str : strs) {
-
-
- Matcher m = pPrice.matcher(str);
- if (m.find()) {
- goods = new AmazonGoods();
- goods.setPrice(new BigDecimal(m.group(1)));
-
-
- goods.setSeller(m.group(2));
-
-
- m = pShipping.matcher(str);
- if (m.find()) {
- goods.setShipping(new BigDecimal(m.group(1)));
- }
-
- list.add(goods);
- } else {
-
- continue;
- }
- }
- return list;
- }
-
- private static String buildTable(List<AmazonGoods> list) {
- StringBuilder b = new StringBuilder("<table>");
- b.append("<tr><th>价格</th><th>运费</th><th>商家信息</th></tr>");
- for (AmazonGoods goods : list) {
- b.append("<tr><th>" + goods.getPrice() + "</th><th>" + goods.getShipping() + "</th><th>" + goods.getSeller()
- + "</th></tr>");
- }
- b.append("</table>");
- return b.toString();
- }
-
- private static void saveToMySQL(List<AmazonGoods> list) {
-
-
-
- Connection con = null;
- PreparedStatement st = null;
- String url = "jdbc:mysql://localhost:3306/";
- String db = "test";
- String driver = "com.mysql.jdbc.Driver";
- String user = "test";
- String pass = "test";
- BigDecimal ZERO = new BigDecimal("0");
- try {
- Class.forName(driver);
- con = DriverManager.getConnection(url + db, user, pass);
- st = con.prepareStatement("insert into AmazonGoods (price,shipping,seller) values(?,?,?)");
- for (AmazonGoods goods : list) {
- st.setBigDecimal(1, goods.getPrice());
- st.setBigDecimal(2, goods.getShipping()==null?ZERO:goods.getShipping());
- st.setString(3, goods.getSeller());
- if (st.executeUpdate() <= 0) {
- throw new Exception("保存数据错误!");
- }
- st.clearParameters();
- }
-
- } catch (Exception ex) {
- ex.printStackTrace();
- } finally {
- if (st != null) {
- try {
- st.close();
- } catch (Exception ex) {
- }
- }
- if (con != null) {
- try {
- con.close();
- } catch (Exception ex) {
- }
- }
- }
- }
- }
package com.laozizhu.test.amazon;
import java.math.BigDecimal;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.laozizhu.tools.PageService;
/**
 * Command-line scraper for a single Amazon offer-listing page.
 *
 * <p>The program downloads the page through {@code PageService}, splits it into
 * one HTML fragment per offer, extracts price, shipping fee and seller
 * information with regular expressions, prints the result as an HTML table and
 * inserts one row per offer into the MySQL table described in AmazonGoods.sql.
 */
public class AmazonFetch {
    /** Offer-listing page that is scraped. */
    private static final String LISTING_URL = "http://www.amazon.com/gp/offer-listing/B0012J52OC/";

    /**
     * Marker that starts each offer in the page HTML. It was chosen by
     * inspecting the page source and is used as the split expression.
     */
    private static final String OFFER_SEPARATOR = "<tbody class=\"result\">";

    /**
     * Matches the price of an offer (group 1) and the seller block including its
     * surrounding {@code <ul>} tags (group 2). The dollar sign is escaped because
     * it is a regex metacharacter. Amounts are digits, optional thousands
     * separators, a decimal point and exactly two decimals.
     */
    static final Pattern pPrice = Pattern.compile(
            "<span class=\"price\">\\$(\\d[\\d,]*\\.\\d{2})</span>.*?(<ul class=\"sellerInformation\">.+?</ul>)",
            Pattern.DOTALL);

    /**
     * Matches the shipping fee (group 1), e.g.
     * {@code <span class="price_shipping">+ $6.04</span>}.
     */
    static final Pattern pShipping = Pattern.compile(
            "<span class=\"price_shipping\">\\+ \\$(\\d[\\d,]*\\.\\d{2})</span>", Pattern.DOTALL);

    /**
     * Runs the whole pipeline: configure the proxy, download the page, parse
     * it, print the HTML table and store the offers in MySQL.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // A proxy is configured unconditionally; remove this call when the
        // machine can reach the internet directly.
        initProxy();
        // Download the page.
        String page = PageService.getPage(LISTING_URL, "ISO-8859-1");
        // Extract the offers; a failed download yields an empty list.
        List<AmazonGoods> list = parse(page);
        // Display the data instead of discarding the generated table.
        System.out.println(buildTable(list));
        // Persist the offers.
        saveToMySQL(list);
    }

    /**
     * Points the JVM-wide HTTP settings at a simple proxy server that needs no
     * password authentication.
     */
    private static void initProxy() {
        Properties prop = System.getProperties();
        // Address of the proxy server used for http access.
        prop.setProperty("http.proxyHost", "10.60.8.20");
        // Port of the proxy server used for http access.
        prop.setProperty("http.proxyPort", "8080");
    }

    /**
     * Parses the page and returns the offers found in it.
     *
     * @param page page HTML; may be {@code null} or empty when the download failed
     * @return the offers in page order; empty when nothing could be parsed
     */
    private static List<AmazonGoods> parse(String page) {
        if (page == null || page.length() == 0) {
            return new ArrayList<AmazonGoods>();
        }
        // Cut the page into one fragment per offer.
        String[] fragments = page.split(OFFER_SEPARATOR);
        // Presize with the fragment count, which is an upper bound for the result.
        List<AmazonGoods> list = new ArrayList<AmazonGoods>(fragments.length);
        for (String fragment : fragments) {
            Matcher price = pPrice.matcher(fragment);
            if (!price.find()) {
                // No price: this fragment does not describe an offer.
                continue;
            }
            AmazonGoods goods = new AmazonGoods();
            goods.setPrice(toAmount(price.group(1)));
            // The seller block is kept as raw HTML (including any JavaScript),
            // because some of its text is produced by script and is hard to strip.
            goods.setSeller(price.group(2));
            // Not every offer has a shipping fee, so it is matched separately
            // instead of being folded into the price pattern.
            Matcher shipping = pShipping.matcher(fragment);
            if (shipping.find()) {
                goods.setShipping(toAmount(shipping.group(1)));
            }
            list.add(goods);
        }
        return list;
    }

    /** Converts a matched amount such as {@code 1,299.00} into a BigDecimal. */
    private static BigDecimal toAmount(String text) {
        return new BigDecimal(text.replace(",", ""));
    }

    /** Returns the shipping fee of an offer, or zero when the offer has none. */
    private static BigDecimal shippingOrZero(AmazonGoods goods) {
        BigDecimal shipping = goods.getShipping();
        return shipping == null ? BigDecimal.ZERO : shipping;
    }

    /**
     * Renders the offers as an HTML table with a header row followed by one
     * data row per offer. Seller information is inserted as-is because it is
     * already HTML.
     *
     * @param list offers to render
     * @return the HTML table markup
     */
    private static String buildTable(List<AmazonGoods> list) {
        StringBuilder b = new StringBuilder("<table>");
        b.append("<tr><th>价格</th><th>运费</th><th>商家信息</th></tr>");
        for (AmazonGoods goods : list) {
            // Data cells use <td>; <th> is reserved for the header row.
            b.append("<tr><td>").append(goods.getPrice())
                    .append("</td><td>").append(shippingOrZero(goods))
                    .append("</td><td>").append(goods.getSeller())
                    .append("</td></tr>");
        }
        b.append("</table>");
        return b.toString();
    }

    /**
     * Inserts every offer into the {@code amazongoods} table of the local
     * {@code test} database. Failures are reported on standard error and do not
     * propagate to the caller.
     *
     * @param list offers to store; nothing is done for an empty list
     */
    private static void saveToMySQL(List<AmazonGoods> list) {
        if (list == null || list.isEmpty()) {
            return;
        }
        // Plain DriverManager connection; see AmazonGoods.sql for the schema.
        String url = "jdbc:mysql://localhost:3306/";
        String db = "test";
        String driver = "com.mysql.jdbc.Driver";
        String user = "test";
        String pass = "test";
        Connection con = null;
        PreparedStatement st = null;
        try {
            Class.forName(driver);
            con = DriverManager.getConnection(url + db, user, pass);
            // The table name is written exactly as in the DDL: MySQL table
            // names are case-sensitive on case-sensitive file systems.
            st = con.prepareStatement("insert into amazongoods (price,shipping,seller) values(?,?,?)");
            for (AmazonGoods goods : list) {
                st.setBigDecimal(1, goods.getPrice());
                // The shipping column is NOT NULL, so a missing fee is stored as 0.
                st.setBigDecimal(2, shippingOrZero(goods));
                st.setString(3, goods.getSeller());
                if (st.executeUpdate() <= 0) {
                    throw new java.sql.SQLException("保存数据错误!");
                }
                st.clearParameters();
            }
        } catch (Exception ex) {
            // Top-level boundary of a best-effort tool: report and carry on.
            ex.printStackTrace();
        } finally {
            if (st != null) {
                try {
                    st.close();
                } catch (Exception ignored) {
                    // Nothing useful can be done when closing fails.
                }
            }
            if (con != null) {
                try {
                    con.close();
                } catch (Exception ignored) {
                    // Nothing useful can be done when closing fails.
                }
            }
        }
    }
}
四、辅助类 PageService.java
- package com.laozizhu.tools;
-
- import java.io.BufferedReader;
- import java.io.FileNotFoundException;
- import java.io.InputStream;
- import java.io.InputStreamReader;
- import java.io.OutputStream;
- import java.net.ConnectException;
- import java.net.HttpURLConnection;
- import java.net.URL;
- import java.util.zip.GZIPInputStream;
-
-
-
-
-
-
/**
 * Small helper for downloading pages over HTTP with {@link HttpURLConnection}.
 *
 * <p>Response bodies are returned as text with every line terminated by CRLF.
 * Responses whose {@code Content-Encoding} is {@code gzip} are decompressed.
 */
public class PageService {
    /** Line terminator appended after every line of a response body. */
    private static final String BR = "\r\n";

    /** User-Agent header sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727)";

    /** Number of additional attempts made after a failed download. */
    private static final int MAX_RETRIES = 3;

    /** Pause between two download attempts, in milliseconds. */
    private static final long RETRY_DELAY_MILLIS = 1000L;

    /**
     * Downloads a page and decodes it as UTF-8.
     *
     * @param page URL of the page
     * @return the page content, or {@code null} if it could not be downloaded
     */
    public static String getPage(String page) {
        return getPage(page, "UTF-8");
    }

    /**
     * Downloads a page and decodes it with the given charset. A failed download
     * is retried up to {@value #MAX_RETRIES} more times with a pause before each
     * retry. A malformed URL is rejected immediately because retrying cannot
     * help.
     *
     * @param page URL of the page
     * @param charset name of the charset used to decode the response body
     * @return the page content, or {@code null} if every attempt failed, the
     *         URL is malformed, or the thread was interrupted while waiting
     */
    public static String getPage(String page, String charset) {
        URL url;
        try {
            url = new URL(page);
        } catch (java.net.MalformedURLException ex) {
            System.out.println("Bad URL:" + page);
            return null;
        }
        String str = fetchOnce(url, page, charset);
        for (int retries = MAX_RETRIES; str == null && retries > 0; retries--) {
            try {
                // Sleep only when another attempt actually follows.
                Thread.sleep(RETRY_DELAY_MILLIS);
            } catch (InterruptedException e) {
                // Keep the interrupt visible to the caller and stop retrying.
                Thread.currentThread().interrupt();
                return null;
            }
            str = fetchOnce(url, page, charset);
        }
        return str;
    }

    /**
     * Performs a single GET request.
     *
     * @param url parsed URL of the page
     * @param page original URL text, used in diagnostics only
     * @param charset name of the charset used to decode the response body
     * @return the response body, or {@code null} if the request failed
     */
    private static String fetchOnce(URL url, String page, String charset) {
        try {
            HttpURLConnection con = (HttpURLConnection) url.openConnection();
            applyCommonHeaders(con, url);
            return readBody(con, charset);
        } catch (FileNotFoundException ex) {
            System.out.println("NOT FOUND:" + page);
            return null;
        } catch (ConnectException ex) {
            System.out.println("Timeout:" + page);
            return null;
        } catch (Exception ex) {
            ex.printStackTrace();
            return null;
        }
    }

    /**
     * Sends {@code msg} as a form-encoded POST body and returns the response
     * decoded as UTF-8.
     *
     * @param page URL to post to
     * @param msg request body; it is encoded as UTF-8
     * @return the response body
     * @throws Exception if the URL is invalid or the request fails
     */
    public static String postPage(String page, String msg) throws Exception {
        URL url = new URL(page);
        HttpURLConnection con = (HttpURLConnection) url.openConnection();
        con.setDoOutput(true);
        applyCommonHeaders(con, url);
        con.setRequestMethod("POST");
        con.addRequestProperty("Content-Type", "application/x-www-form-urlencoded");
        OutputStream os = con.getOutputStream();
        try {
            os.write(msg.getBytes("UTF-8"));
        } finally {
            // Finish the request body before the response is read, and release
            // the stream even when writing fails.
            os.close();
        }
        return readBody(con, "UTF-8");
    }

    /**
     * Sets the User-Agent and Host request headers. The Host value is taken
     * from the parsed URL (host plus explicit port, if any) rather than from
     * fixed string offsets, so it is also correct for URLs that do not start
     * with {@code http://}.
     */
    private static void applyCommonHeaders(HttpURLConnection con, URL url) {
        con.setRequestProperty("User-Agent", USER_AGENT);
        int port = url.getPort();
        con.setRequestProperty("Host", port == -1 ? url.getHost() : url.getHost() + ":" + port);
    }

    /**
     * Reads the whole response body as text, decompressing it first when the
     * response declares gzip content encoding. The response stream is closed
     * whether or not reading succeeds.
     */
    private static String readBody(HttpURLConnection con, String charset) throws java.io.IOException {
        InputStream is = con.getInputStream();
        try {
            if ("gzip".equalsIgnoreCase(con.getContentEncoding())) {
                is = new GZIPInputStream(is);
            }
            BufferedReader reader = new BufferedReader(new InputStreamReader(is, charset));
            StringBuilder b = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                b.append(line);
                b.append(BR);
            }
            return b.toString();
        } finally {
            // Closing the outermost stream also closes the streams it wraps.
            is.close();
        }
    }
}