该程序是爬取京东上的Java图书信息
book模型:
// Fields of the Book model; every value is stored as the raw string scraped from the page.
private String bookID;    // value of the data-sku attribute of the search-result item
private String bookName;  // text of the <em> element inside the p-name div
private String bookPrice; // text of the <i> element inside the p-price div
1)httpclient maven配置:(不同版本创建HttpClient方法不同)
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.1.2</version>
</dependency>
2)main方法:(获取数据,存放数据)
/**
 * Entry point of the crawler: downloads one JD search-result page for the keyword "java",
 * logs every parsed book and then stores the books in the database.
 */
public class bookMain {
    static final Log logger = LogFactory.getLog(bookMain.class); // log4j

    /**
     * Fetches the seed URL, logs the parsed books and inserts them via {@code mysql_control}.
     *
     * @param args unused
     * @throws Exception if fetching the page or inserting into the database fails
     */
    public static void main(String[] args) throws Exception {
        HttpClient httpclient = new DefaultHttpClient(); // create the HttpClient
        String url = "https://search.jd.com/Search?keyword=java&enc=utf-8&wq=java&pvid=f961dczi.8r5joc"; // seed URL
        try {
            // Typed list: iterating a raw List with "for (Book book : ...)" does not compile.
            List<Book> books = URLEntity.URLParse(httpclient, url); // fetch and parse the page
            for (Book book : books) {
                logger.info("bookId:" + book.getBookID() + "\t" + "bookName:" + book.getBookName() + "\t" + "bookPrice:"
                        + book.getBookPrice() + "\t");
            }
            mysql_control.executeInsert(books); // store the books in the database
        } finally {
            // Release the client's connections even when fetching or inserting fails.
            httpclient.getConnectionManager().shutdown();
        }
    }
}
3)获取response(httpUtil类)
public class httpUtil {
public static HttpResponse getHtml(HttpClient httpclient, String url) throws IOException
{
HttpGet getMethod = new HttpGet(url); /get方法
HttpResponse response = new BasicHttpResponse(HttpVersion.HTTP_1_1,HttpStatus.SC_OK,"ok"); //response初始化
response = httpclient.execute(getMethod); //执行get方法
return response;
}
}
4)返回实体中的信息(URLEntity类)
调用3)获取response
/** Fetches a page and turns its body into a list of books. */
public class URLEntity {
    /**
     * Requests {@code url} through {@code httpUtil.getHtml} and, when the status code is 200,
     * decodes the body as UTF-8 and hands it to {@code bookParse.getData}.
     *
     * @param httpclient client used to execute the request
     * @param url        address to fetch
     * @return the parsed books; an empty list when the status is not 200 or there is no body
     * @throws IOException if the request fails or the body cannot be read
     */
    public static List<Book> URLParse(HttpClient httpclient, String url) throws IOException {
        List<Book> getbooks = new ArrayList<Book>();
        HttpResponse response = httpUtil.getHtml(httpclient, url);
        try {
            int statusCode = response.getStatusLine().getStatusCode(); // status code of the response
            // 200 means success; also guard against a response that carries no entity.
            if (statusCode == 200 && response.getEntity() != null) {
                String entity = EntityUtils.toString(response.getEntity(), "utf-8");
                getbooks = bookParse.getData(entity);
            }
        } finally {
            // The entity must always be consumed, including when reading or parsing throws.
            EntityUtils.consume(response.getEntity());
        }
        return getbooks;
    }
}
5)解析html(此处使用的是jsoup)bookParse类
/** Extracts book records from the HTML of a search-result page using jsoup. */
public class bookParse {
    /**
     * Parses the page and builds one {@code Book} per {@code li[class=gl-item]} element found
     * under {@code ul[class=gl-warp clearfix]}.
     *
     * @param html page source; {@code null} yields an empty list
     * @return the books in document order, never {@code null}
     */
    public static List<Book> getData(String html) {
        List<Book> datas = new ArrayList<Book>();
        if (html == null) {
            return datas; // nothing to parse
        }
        Document doc = Jsoup.parse(html);
        Elements elements = doc.select("ul[class=gl-warp clearfix]").select("li[class=gl-item]");
        for (Element element : elements) {
            // id comes from the data-sku attribute, price and name from the element text
            String bookid = element.select("div[class=gl-i-wrap j-sku-item]").attr("data-sku");
            String bookprice = element.select("div[class=p-price]").select("strong").select("i").text();
            String bookname = element.select("div[class=p-name]").select("em").text();
            Book book = new Book();
            book.setBookID(bookid);
            book.setBookName(bookname);
            book.setBookPrice(bookprice);
            datas.add(book);
        }
        return datas;
    }
}
/** Factory for the DataSource used to reach the MySQL database. */
public class mysql_source {
    /**
     * Creates a {@code BasicDataSource} configured with the MySQL JDBC driver, the fixed
     * account below and the supplied connection URL.
     *
     * @param connectURI JDBC URL of the database
     * @return the configured data source
     */
    public static DataSource getDataSource(String connectURI) {
        BasicDataSource source = new BasicDataSource();
        source.setUrl(connectURI);
        source.setDriverClassName("com.mysql.jdbc.Driver");
        // NOTE(review): credentials are hard-coded here; consider loading them from configuration.
        source.setUsername("root");
        source.setPassword("wodemima");
        return source;
    }
}
6)存储数据到MySQL:上面的mysql_source类提供数据源,下面的mysql_control类执行批量插入
/** Writes parsed books into the {@code books} table with a single batch insert. */
public class mysql_control {
    static DataSource ds = mysql_source.getDataSource("jdbc:mysql://127.0.0.1:3306/book");
    static QueryRunner qr = new QueryRunner(ds);

    /**
     * Inserts every book of the list as one row (id, name, price) using {@code QueryRunner.batch}
     * and prints the number of books to standard output.
     *
     * @param bookdatas books to insert
     * @throws SQLException if the batch insert fails
     */
    public static void executeInsert(List<Book> bookdatas) throws SQLException {
        // Typed parameter: with a raw List, get(i) yields Object and the getters do not compile.
        Object[][] params = new Object[bookdatas.size()][3];
        int row = 0;
        for (Book book : bookdatas) {
            params[row][0] = book.getBookID();
            params[row][1] = book.getBookName();
            params[row][2] = book.getBookPrice();
            row++;
        }
        // NOTE(review): the column is spelled "bookNam" while the model field is bookName;
        // left unchanged because the table schema is not visible here. Verify against the table.
        qr.batch("insert into books(bookID,bookNam,bookPrice)values(?,?,?)", params);
        System.out.println("成功插入" + bookdatas.size() + "条");
    }
}
7)补充
log4j.properties文件内容
# Root logger: level DEBUG, sending events to the appender named "stdout".
log4j.rootLogger=DEBUG, stdout
# "stdout" is a console appender.
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
# Format each event with a pattern layout.
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
# Pattern: level left-aligned and padded to 5 characters, " - ", the message, a line break.
log4j.appender.stdout.layout.ConversionPattern=%-5p - %m%n
只是输出到控制台