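<dependency>
    <groupId>com.geccocrawler</groupId>
    <artifactId>gecco</artifactId>
    <version>1.1.0</version>
</dependency>
<dependency>
    <groupId>com.geccocrawler</groupId>
    <artifactId>gecco-htmlunit</artifactId>
    <version>1.0.5</version>
</dependency>
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.38</version>
</dependency>
<dependency>
    <groupId>com.belerweb</groupId>
    <artifactId>pinyin4j</artifactId>
    <version>2.5.0</version>
</dependency>
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.34</version>
</dependency>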
package org.ssgroup.spider.constant;
import java.util.LinkedHashMap;
import java.util.Map;
/**
 * Static lookup data for the mayi.com crawler: the city names to crawl,
 * grouped by pinyin initial, and the fixed list of location-filter types.
 *
 * @date 2018-11-09
 */
public class MaYiAllCityConstant {
    /**
     * Hand-off flag between the list pipeline and the start-up loop:
     * {@code MaYiListPipeline.process} sets it to {@code true} and
     * {@code Application.init} polls it and resets it to {@code false}.
     * Declared {@code volatile} so that a write made on a crawler thread is
     * guaranteed to become visible to the polling thread.
     */
    public static volatile boolean on = true;

    /**
     * All cities listed on mayi.com, keyed by group name ("holdCity", "ABCD",
     * "EFGH", "JKLM", "NPQR", "STW", "XYZ") in insertion order. Each value is
     * the array of Chinese city names in that group.
     */
    public static Map<String, String[]> ALL_CITY = new LinkedHashMap<String, String[]>();

    /**
     * Location-filter types shown on a mayi.com list page, in the order
     * business district, scenic spot, administrative district,
     * station/airport, subway, school, hospital. Some entries carry a trailing
     * space; they are kept byte-for-byte because these strings are also
     * written to the database by {@code Application.initCityLocation}.
     */
    public static String[] CITY_LOCATION = new String[] {"商圈","景点 ","行政区 ","车站机场 ","地铁","学校","医院"};

    static {
        // Popular domestic cities
        String[] holdCity = new String[] {"北京","上海","青岛","香港","成都","杭州","台北","三亚","大连","广州","西安","重庆","厦门","秦皇岛","屏东","花莲",
                "深圳","南京","苏州","烟台","威海","天津","北海","武汉","葫芦岛","长沙","哈尔滨","沈阳","昆明","营口"};
        ALL_CITY.put("holdCity", holdCity);
        // ABCD
        String[] abcd = new String[] {"鞍山","安阳","阿拉善","澳门","安顺","安庆","安康","阿勒泰",
                "北京","北戴河","北海","保定","本溪","包头","白山","宝鸡","蚌埠","博鳌","保山","百色","巴音郭楞","博尔塔拉","保亭","白沙",
                "成都","重庆","长沙","长春","承德","常州","长白山","赤峰","沧州","郴州","长治","潮州","常德","滁州","池州","昌吉","昌江",
                "大连","东戴河","丹东","大理","东莞","大同","德州","德阳","敦煌","东营","迪庆","都江堰","东方","德宏","定西","儋州","大兴安岭"};
        ALL_CITY.put("ABCD", abcd);
        // EFGH
        String[] efgh = new String[] {"洱海","峨眉山","恩施","鄂尔多斯","鄂州",
                "福州","佛山","防城港","凤凰","抚顺","阜阳","抚州","阜新",
                "广州","贵阳","高雄","桂林","赣州","甘孜","广元","贵港","鼓浪屿","甘南",
                "杭州","花莲","葫芦岛","哈尔滨","合肥","海口","惠州","湖州","莫干山","呼和浩特","黄山","呼伦贝尔","横店",
                "邯郸","衡水","淮安","衡阳","黑河","汉中","菏泽","红河","河源","黄龙","海西","海北","贺州","淮北","怀化","河池","黄石","海拉尔",
                "淮南","海东","鹤岗","和田"};
        ALL_CITY.put("EFGH", efgh);
        // JKLM
        String[] jklm = new String[] {"济南","基隆","嘉义","金门","嘉兴","锦州","吉林","济宁","九江","江门","焦作","景德镇","佳木斯","荆州","嘉峪关","九寨沟","晋城",
                "吉安","揭阳","鸡西","济源",
                "昆明","开封","克拉玛依","库尔勒",
                "丽江","临高","兰州","洛阳","临沂","乐山","廊坊","连云港","柳州","庐山","凉山","聊城","拉萨","临汾","丽水","六盘水","泸州","龙岩","吕梁",
                "陵水","辽源","陇南","林芝","临夏","乐东","临沧",
                "苗栗","马祖","绵阳","茂名","梅州","眉山","马鞍山"};
        ALL_CITY.put("JKLM", jklm);
        // NPQR
        String[] npqr = new String[] {"南戴河","南京","南投","宁波","南宁","南昌","南通","南阳","南充","南平","宁德","内江",
                "屏东","澎湖","普陀山","盘锦","平遥","莆田","平顶山","蓬莱","攀枝花","普洱","萍乡","平凉",
                "青岛","千岛湖","秦皇岛","泉州","清远","黔东南","齐齐哈尔","衢州","黔南","黔西南","曲靖","钦州",
                "日照","日喀则"};
        ALL_CITY.put("NPQR", npqr);
        // STW
        String[] stw = new String[] {"上海","上海迪士尼","三亚","深圳","苏州","沈阳","石家庄","绍兴","汕头","上饶","韶关","四平","三明","松原","十堰","神农架","遂宁","石河子",
                "绥化","商洛","随州","三沙",
                "台北","天涯海角","同里","天津","台东","台南","台中","桃园","太原","唐山","泰安","泰山","台州","通化","泰州","天水","通辽","吐鲁番","塔城","屯昌",
                "威海","武汉","温州","无锡","乌鲁木齐","潍坊","武夷山","芜湖","文昌","梧州","渭南","文山","万宁","武威","五指山"};
        ALL_CITY.put("STW", stw);
        // XYZ
        String[] xyz = new String[] {"香港","西湖","西安","厦门","新北","新竹","西塘","西宁","徐州","西双版纳","新乡","雪乡","咸阳","邢台","湘西","湘潭","信阳","锡林郭勒","许昌","忻州",
                "宣城","襄樊","兴安","宿迁","咸宁","宿州","孝感",
                "烟台","营口","云林","宜兰","扬州","银川","延边","阳江","宜昌","盐城","宜宾","延安","运城","玉溪","伊春","伊犁","雅安","宜春","岳阳","玉林","榆林",
                "益阳","阳朔","洋浦",
                "周庄","郑州","珠海","彰化","舟山","中山","张家口","张家界","漳州","湛江","淄博","遵义","枣庄","镇江","株洲","肇庆","自贡","张掖","中卫","周口",
                "驻马店","昭通","资阳"};
        ALL_CITY.put("XYZ", xyz);
    }
}
package org.ssgroup.spider.htmlBean;
import java.util.List;
import org.ssgroup.spider.htmlBean.domain.MaYiRoom;
import org.ssgroup.spider.htmlBean.domain.Page;
import org.ssgroup.spider.htmlBean.domain.list.CarOrAirport;
import org.ssgroup.spider.htmlBean.domain.list.Hospital;
import org.ssgroup.spider.htmlBean.domain.list.OfficeAreas;
import org.ssgroup.spider.htmlBean.domain.list.Offices;
import org.ssgroup.spider.htmlBean.domain.list.ScenicArea;
import org.ssgroup.spider.htmlBean.domain.list.School;
import org.ssgroup.spider.htmlBean.domain.list.ShopLoops;
import org.ssgroup.spider.htmlBean.domain.list.SubWayLine;
import org.ssgroup.spider.htmlBean.domain.list.SubWayStation;
import com.geccocrawler.gecco.annotation.Gecco;
import com.geccocrawler.gecco.annotation.HtmlField;
import com.geccocrawler.gecco.annotation.Request;
import com.geccocrawler.gecco.annotation.RequestParameter;
import com.geccocrawler.gecco.annotation.Text;
import com.geccocrawler.gecco.request.HttpRequest;
import com.geccocrawler.gecco.spider.HtmlBean;
/**
 * Gecco bean for a mayi.com list page.
 *
 * <p>Bound to URLs of the form {@code http://www.mayi.com/{city}/{code}} and
 * handed to the pipeline named {@code maYiListPipeline}. The list fields are
 * declared with their element types so that both Gecco and the pipeline see
 * which bean each list holds.
 *
 * @author HX-011
 * @date 2018-11-09
 */
@Gecco(matchUrl="http://www.mayi.com/{city}/{code}", pipelines="maYiListPipeline")
public class MaYiListHtmlBean implements HtmlBean {
    private static final long serialVersionUID = -5332646457923675928L;

    /** The request that produced this page; used to schedule follow-up pages. */
    @Request
    private HttpRequest request;

    /** The {city} segment of the matched URL. */
    @RequestParameter("city")
    private String city;

    /** The {code} segment of the matched URL (the requested page number). */
    @RequestParameter("code")
    private String code;

    /** Text of the active pager link, i.e. the page number shown as current. */
    @Text
    @HtmlField(cssPath="#page > a.pg-active")
    private String page;

    /** Hidden pager input(s); the pipeline reads the total page count from the first entry. */
    @HtmlField(cssPath="#page > input[type=hidden]")
    private List<Page> pages;

    /** All room listings on the page. */
    @HtmlField(cssPath="#searchRoom > dd")
    private List<MaYiRoom> room;

    /** Business districts: type=1. */
    @HtmlField(cssPath="#position > div.rt-word.position_choose > div.lever.result-business.pr60 > div > span")
    private List<ShopLoops> shopLoops;

    /** Scenic spots: type=2. */
    @HtmlField(cssPath="#position > div.rt-word.position_choose > div.lever.pr60.result-scenic > div > a")
    private List<ScenicArea> scenicAreas;

    /** Administrative districts: type=3. */
    @HtmlField(cssPath="#position > div.rt-word.position_choose > div.lever.result-area > a")
    private List<OfficeAreas> officeAreas;

    /** Administrative districts: sub-areas. */
    @HtmlField(cssPath="#position > div.rt-word.position_choose > div.lever.result-area > div")
    private List<Offices> offices;

    /** Stations and airports: type=4. */
    @HtmlField(cssPath="#position > div.rt-word.position_choose > div.lever.pr60.result-traffic > div > span")
    private List<CarOrAirport> carOrAirport;

    /** Subway lines: type=5. */
    @HtmlField(cssPath="#position > div.rt-word.position_choose > div.lever.result-metro > a")
    private List<SubWayLine> subWayLine;

    /** Subway station details, one entry per line block. */
    @HtmlField(cssPath="#position > div.rt-word.position_choose > div.lever.result-metro > div")
    private List<SubWayStation> subWayStation;

    /** Schools: type=6. */
    @HtmlField(cssPath="#position > div.rt-word.position_choose > div.lever.pr60.result-school > div > a")
    private List<School> school;

    /** Hospitals: type=7. */
    @HtmlField(cssPath="#position > div.rt-word.position_choose > div.lever.pr60.result-hospital > div > a")
    private List<Hospital> hospital;

    public HttpRequest getRequest() {
        return request;
    }

    public void setRequest(HttpRequest request) {
        this.request = request;
    }

    public String getCity() {
        return city;
    }

    public void setCity(String city) {
        this.city = city;
    }

    public String getCode() {
        return code;
    }

    public void setCode(String code) {
        this.code = code;
    }

    public String getPage() {
        return page;
    }

    public void setPage(String page) {
        this.page = page;
    }

    public List<Page> getPages() {
        return pages;
    }

    public void setPages(List<Page> pages) {
        this.pages = pages;
    }

    public List<MaYiRoom> getRoom() {
        return room;
    }

    public void setRoom(List<MaYiRoom> room) {
        this.room = room;
    }

    public List<ShopLoops> getShopLoops() {
        return shopLoops;
    }

    public void setShopLoops(List<ShopLoops> shopLoops) {
        this.shopLoops = shopLoops;
    }

    public List<ScenicArea> getScenicAreas() {
        return scenicAreas;
    }

    public void setScenicAreas(List<ScenicArea> scenicAreas) {
        this.scenicAreas = scenicAreas;
    }

    public List<OfficeAreas> getOfficeAreas() {
        return officeAreas;
    }

    public void setOfficeAreas(List<OfficeAreas> officeAreas) {
        this.officeAreas = officeAreas;
    }

    public List<Offices> getOffices() {
        return offices;
    }

    public void setOffices(List<Offices> offices) {
        this.offices = offices;
    }

    public List<CarOrAirport> getCarOrAirport() {
        return carOrAirport;
    }

    public void setCarOrAirport(List<CarOrAirport> carOrAirport) {
        this.carOrAirport = carOrAirport;
    }

    public List<SubWayLine> getSubWayLine() {
        return subWayLine;
    }

    public void setSubWayLine(List<SubWayLine> subWayLine) {
        this.subWayLine = subWayLine;
    }

    public List<SubWayStation> getSubWayStation() {
        return subWayStation;
    }

    public void setSubWayStation(List<SubWayStation> subWayStation) {
        this.subWayStation = subWayStation;
    }

    public List<School> getSchool() {
        return school;
    }

    public void setSchool(List<School> school) {
        this.school = school;
    }

    public List<Hospital> getHospital() {
        return hospital;
    }

    public void setHospital(List<Hospital> hospital) {
        this.hospital = hospital;
    }
}
package org.ssgroup.spider.service;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.ssgroup.spider.Application;
import org.ssgroup.spider.constant.MaYiAllCityConstant;
import org.ssgroup.spider.htmlBean.MaYiListHtmlBean;
import org.ssgroup.spider.htmlBean.domain.MaYiRoom;
import org.ssgroup.spider.htmlBean.domain.list.CarOrAirport;
import org.ssgroup.spider.htmlBean.domain.list.Hospital;
import org.ssgroup.spider.htmlBean.domain.list.Office;
import org.ssgroup.spider.htmlBean.domain.list.OfficeAreas;
import org.ssgroup.spider.htmlBean.domain.list.Offices;
import org.ssgroup.spider.htmlBean.domain.list.ScenicArea;
import org.ssgroup.spider.htmlBean.domain.list.School;
import org.ssgroup.spider.htmlBean.domain.list.ShopLoops;
import org.ssgroup.spider.htmlBean.domain.list.Station;
import org.ssgroup.spider.htmlBean.domain.list.SubWayLine;
import org.ssgroup.spider.htmlBean.domain.list.SubWayStation;
import org.ssgroup.spider.htmlBean.domain.list.Vehicle;
import org.ssgroup.spider.utils.JdbcUtils;
import org.ssgroup.spider.utils.PinYinUtils;
import com.geccocrawler.gecco.annotation.PipelineName;
import com.geccocrawler.gecco.pipeline.Pipeline;
import com.geccocrawler.gecco.request.HttpRequest;
import com.geccocrawler.gecco.scheduler.SchedulerContext;
@PipelineName("maYiListPipeline")
public class MaYiListPipeline implements Pipeline{
/**
 * Persists one crawled list page and schedules the following page.
 *
 * <p>When the page contains rooms, the location filters and the rooms are
 * saved, and the next page URL (current URL with its last path segment
 * replaced by the next page number) is queued while more pages remain. Once
 * the page just processed is the last one, {@code MaYiAllCityConstant.on} is
 * set so the start-up loop can move on.
 *
 * <p>Any exception is printed and swallowed so a single bad page does not
 * stop the crawl.
 *
 * @param bean the parsed list page
 */
public void process(MaYiListHtmlBean bean) {
    try {
        List<MaYiRoom> rooms = bean.getRoom();
        if (rooms == null || rooms.isEmpty()) {
            return;
        }
        String cityPinYin = bean.getCity();
        // Save the location filters
        saveLocation(bean, cityPinYin);
        // Save the room listings
        saveRooms(rooms, cityPinYin);

        // Work out where we are in the pagination
        String pageStr = StringUtils.isNotBlank(bean.getPage()) ? bean.getPage() : "1";
        int currentPage = Integer.parseInt(pageStr);
        int page = currentPage + 1;
        // If the pager's hidden input was not extracted, treat the current
        // page as the last one instead of failing on get(0).
        int pageCount = currentPage;
        if (bean.getPages() != null && !bean.getPages().isEmpty()) {
            pageCount = bean.getPages().get(0).getPage();
        }
        System.out.println("城市拼音【"+bean.getCity()+"】,第【"+pageStr+"】次分页抓取,总分页数【"+pageCount+"】,每一次抓取数量【"+rooms.size()+"】");

        if (page > 0 && page <= pageCount) {
            HttpRequest request = bean.getRequest();
            String nextUrl = request.getUrl();
            if (StringUtils.isNoneBlank(nextUrl)) {
                String baseUrl = StringUtils.substringBeforeLast(nextUrl, "/");
                nextUrl = baseUrl + "/" + page;
                System.out.println("分页请求地址【"+nextUrl+"】");
                SchedulerContext.into(request.subRequest(nextUrl));
            }
        }

        // The last page has been processed: allow the next batch to start.
        // The previous test (page == pageCount) fired one page early and
        // never fired when there was only a single page.
        if (page > pageCount) {
            MaYiAllCityConstant.on = true;
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
/**
 * Inserts the given rooms into the {@code rooms} table as one batch inside a
 * single transaction.
 *
 * <p>Failures during the insert are printed and the transaction is rolled
 * back; they are not rethrown.
 *
 * @param rooms rooms extracted from one list page
 * @param city  pinyin key of the city, used to look up the city id in
 *              {@code Application.CITY_CACHE}
 * @throws SQLException if the rollback or closing the JDBC resources fails
 */
private void saveRooms(List<MaYiRoom> rooms, String city) throws SQLException {
    Connection conn = null;
    PreparedStatement pstmt = null;
    try {
        conn = JdbcUtils.getConnection();
        conn.setAutoCommit(false);
        String sql = "INSERT INTO rooms(id,price,house_location,original_url,image_url,title,num_room,num_house,city,city_id) " +
                "VALUES(?,?,?,?,?,?,?,?,?,?)";
        pstmt = conn.prepareStatement(sql);
        // The city is the same for every row; look it up once.
        long cityId = Application.CITY_CACHE.get(city).getId();
        for (MaYiRoom room : rooms) {
            pstmt.setLong(1, room.getRoomId());
            pstmt.setFloat(2, room.getPrice());
            pstmt.setString(3, room.getPosition());
            pstmt.setString(4, room.getOriginalImageUrl());
            pstmt.setString(5, room.getImageUrl());
            pstmt.setString(6, room.getTitle());
            pstmt.setString(7, room.getRooms());
            pstmt.setString(8, room.getHousing());
            pstmt.setString(9, city);
            pstmt.setLong(10, cityId);
            pstmt.addBatch();
        }
        pstmt.executeBatch();
        conn.commit();
    } catch (Exception e) {
        e.printStackTrace();
        // getConnection() itself may have failed, leaving conn null.
        if (conn != null) {
            conn.rollback();
        }
    } finally {
        JdbcUtils.close(conn, pstmt, null);
    }
}
/**
 * Saves the location filters found on a list page into {@code city_location}.
 *
 * For each entry of {@code MaYiAllCityConstant.CITY_LOCATION} (1-based index i)
 * the matching list on the bean is inserted as one batch. For types 3, 4 and 5
 * the generated keys of that batch are then read back and used as the parent
 * id of a second batch of child rows.
 *
 * NOTE(review): the loop headers for cases 3, 4 and 5 of the second switch
 * (the three lines starting {@code for(int j=0;j}) appear truncated in this
 * copy of the file; the case-3 body and the {@code case 4:} label are missing.
 * Restore them from version control before changing this method.
 *
 * @param bean the parsed list page
 * @param city pinyin key of the city the page belongs to
 * @throws Exception if the rollback or closing the JDBC resources fails
 */
private void saveLocation(MaYiListHtmlBean bean,String city) throws Exception{
Connection conn = null;
PreparedStatement pstmt = null;
// NOTE(review): resultSet is never assigned, so the close() in finally never
// closes anything for it; the generated-keys ResultSet "rs" below is not closed.
ResultSet resultSet = null;
String sql = "INSERT INTO city_location(name,pin_yin,city_id,city_name,city_pin_yin,parent_id,href) " +
"VALUES(?,?,?,?,?,?,?)";
try {
conn = JdbcUtils.getConnection();
// Order of the types: business district, scenic spot, administrative district,
// station/airport, subway, school, hospital
String[] cityLocation = MaYiAllCityConstant.CITY_LOCATION;
for(int i=1;i<=cityLocation.length;i++) {
String location = cityLocation[i-1];
conn.setAutoCommit(false);
// NOTE(review): a new statement is prepared on every iteration and the
// previous one is not closed; only the last one reaches JdbcUtils.close.
pstmt = conn.prepareStatement(sql,Statement.RETURN_GENERATED_KEYS);
// Original comment: "city ID". The value is the id of the cached location-type
// row, looked up by the pinyin of the type name; addBatch writes it to parent_id.
String tmpLocaltion = PinYinUtils.convertLower(location);
Long city_id = Application.CITY_LOCATION_CACHE.get(tmpLocaltion).getId();
switch (i) {
case 1: // business districts
List shopLoop = bean.getShopLoops();
if(null!=shopLoop && shopLoop.size()>0) {
for(ShopLoops sl : shopLoop) {
String name = sl.getLocation();
addBatch(pstmt, name, city, city_id, sl.getHref());
}
}
break;
case 2: // scenic spots
List scenicAreas = bean.getScenicAreas();
if(null!=scenicAreas && scenicAreas.size()>0) {
for(ScenicArea scenicArea : scenicAreas) {
addBatch(pstmt, scenicArea.getScenic(), city, city_id, scenicArea.getHref());
}
}
break;
case 3: // administrative districts (no href; children are added below)
List officeAreas = bean.getOfficeAreas();
if(null!=officeAreas && officeAreas.size()>0) {
for(OfficeAreas officeArea : officeAreas) {
addBatch(pstmt, officeArea.getOfficeAreas(), city, city_id, null);
}
}
break;
case 4: // stations and airports (no href; children are added below)
List carOrAirport = bean.getCarOrAirport();
if(null!=carOrAirport && carOrAirport.size()>0) {
for(CarOrAirport ca : carOrAirport) {
addBatch(pstmt, ca.getName(), city, city_id, null);
}
}
break;
case 5: // subway lines (no href; children are added below)
List subWayLine = bean.getSubWayLine();
if(null!=subWayLine && subWayLine.size()>0) {
for(SubWayLine swl : subWayLine) {
addBatch(pstmt, swl.getName(), city, city_id, null);
}
}
break;
case 6: // schools
List school = bean.getSchool();
if(null!=school && school.size()>0) {
for(School s : school) {
addBatch(pstmt, s.getName(), city, city_id, s.getHref());
}
}
break;
case 7: // hospitals
List hospital = bean.getHospital();
if(null!=hospital && hospital.size()>0) {
for(Hospital h : hospital) {
addBatch(pstmt, h.getName(), city, city_id, h.getHref());
}
}
break;
}
pstmt.executeBatch();
conn.commit();
// Add the child nodes
if(i==3||i==4||i==5) {
// Read back the auto-increment ids generated by the batch above
ResultSet rs = pstmt.getGeneratedKeys();
List list = new ArrayList();
while(rs.next()) {
list.add(rs.getLong(1));// collect the id
}
conn.setAutoCommit(false);
pstmt = conn.prepareStatement(sql,Statement.RETURN_GENERATED_KEYS);
// Children use list.get(j) as parent id, i.e. the j-th generated key is
// paired with the j-th parent entry of the bean list.
switch (i) {
case 3:
List offices = bean.getOffices();
if(null!=offices && offices.size()>0) {
// NOTE(review): the next line is truncated in this copy (see method comment).
for(int j=0;j cas = bean.getCarOrAirport();
if(null!=cas && cas.size()>0) {
// NOTE(review): truncated loop header.
for(int j=0;j vehicles = cas.get(j).getVehicles();
if(null!=vehicles && vehicles.size()>0) {
for(Vehicle vehicle : vehicles) {
addBatch(pstmt, vehicle.getVehicle(), city, list.get(j), vehicle.getHref());
}
}
}
}
break;
case 5:
List subWayStation = bean.getSubWayStation();
if(null!=subWayStation && subWayStation.size()>0) {
// NOTE(review): truncated loop header.
for(int j=0;j stations = subWayStation.get(j).getStation();
for(Station station : stations) {
addBatch(pstmt, station.getName(), city, list.get(j), station.getHref());
}
}
}
break;
}
pstmt.executeBatch();
conn.commit();
}
}
}catch (Exception e) {
e.printStackTrace();
// NOTE(review): conn is still null here if getConnection() failed, in which
// case this rollback call throws a NullPointerException.
conn.rollback();
}finally {
JdbcUtils.close(conn, pstmt, resultSet);
}
}
/**
 * Binds one {@code city_location} row to the statement and queues it in the
 * current batch.
 *
 * @param pstmt   statement prepared for the seven-column city_location insert
 * @param name    display name of the location; its lower-case pinyin is stored too
 * @param city    pinyin key of the owning city in {@code Application.CITY_CACHE}
 * @param city_id value bound to the sixth placeholder (the parent_id column)
 * @param href    link of the location, may be {@code null}
 * @throws SQLException if binding a parameter or queueing the batch fails
 */
private void addBatch(PreparedStatement pstmt,String name,String city,Long city_id,String href) throws SQLException {
    // Location name and its pinyin
    pstmt.setString(1, name);
    final String namePinYin = PinYinUtils.convertLower(name);
    pstmt.setString(2, namePinYin);

    // Owning city: id, display name and pinyin key
    final long ownerCityId = Application.CITY_CACHE.get(city).getId();
    pstmt.setLong(3, ownerCityId);
    final String ownerCityName = Application.CITY_CACHE.get(city).getName();
    pstmt.setString(4, ownerCityName);
    pstmt.setString(5, city);

    // Parent row and link
    pstmt.setLong(6, city_id);
    pstmt.setString(7, href);

    pstmt.addBatch();
}
}
package org.ssgroup.spider.utils;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
/**
 * Minimal JDBC helper: loads the MySQL driver once and hands out connections
 * from {@link DriverManager}.
 *
 * <p>NOTE(review): the credentials and URL are hard-coded; consider moving
 * them to external configuration.
 */
public class JdbcUtils {
    private static final String USERNAME = "root";
    private static final String PASSWORD = "root";
    private static final String DRIVER = "com.mysql.jdbc.Driver";
    private static final String URL = "jdbc:mysql://192.168.8.110:3306/test";

    static {
        try {
            // Only registers the driver class; no connection is opened here.
            Class.forName(DRIVER);
            System.out.println("数据库连接成功!");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Opens a new connection to the configured database.
     *
     * @return a new connection; the caller must close it
     * @throws SQLException if the connection cannot be established
     */
    public static Connection getConnection() throws SQLException {
        return DriverManager.getConnection(URL, USERNAME, PASSWORD);
    }

    /**
     * Closes the given resources in the order result set, statement,
     * connection. Each argument may be {@code null}.
     *
     * <p>Every non-null resource is closed even when closing an earlier one
     * fails, so a failing {@code ResultSet.close()} can no longer leak the
     * statement and the connection.
     *
     * @param connection connection to close, or {@code null}
     * @param pstmt      statement to close, or {@code null}
     * @param resultSet  result set to close, or {@code null}
     * @throws SQLException if closing any of the resources fails
     */
    public static void close(Connection connection, PreparedStatement pstmt, ResultSet resultSet) throws SQLException {
        try {
            if (resultSet != null) {
                resultSet.close();
            }
        } finally {
            try {
                if (pstmt != null) {
                    pstmt.close();
                }
            } finally {
                if (connection != null) {
                    connection.close();
                }
            }
        }
    }
}
package org.ssgroup.spider.utils;
import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;
import org.apache.commons.lang3.StringUtils;
/**
 * Helpers for converting Chinese text to pinyin.
 *
 * <p>Each character in the range U+4E00..U+9FA5 is replaced by the first
 * reading pinyin4j returns for it (no tone marks); every other character is
 * copied through unchanged.
 */
public class PinYinUtils {
    /**
     * Converts to upper-case pinyin, e.g. 中国人民银行 =====>ZHONGGUORENMINYINHANG
     *
     * @param text text to convert
     * @return upper-case pinyin, or "" for blank input
     * @author lance
     */
    public static String convertUpper(String text){
        return convert(text, HanyuPinyinCaseType.UPPERCASE, false);
    }

    /**
     * Converts to lower-case pinyin, e.g. 中国人民银行 =====>zhongguorenminyinhang
     *
     * @param text text to convert
     * @return lower-case pinyin, or "" for blank input
     * @author lance
     */
    public static String convertLower(String text){
        return convert(text, HanyuPinyinCaseType.LOWERCASE, false);
    }

    /**
     * Converts to pinyin with the first letter of every syllable capitalised,
     * e.g. 中国人民银行 =====>ZhongGuoRenMinYinHang
     *
     * @param text text to convert
     * @return capitalised pinyin, or "" for blank input
     * @author lance
     */
    public static String converCapitalize(String text){
        return convert(text, null, true);
    }

    /**
     * Returns the capitalised pinyin with every lower-case ASCII letter
     * removed, i.e. the initials, e.g. 中国人民银行 =====>ZGRMYH
     *
     * @param text text to convert
     * @return the initials, or "" when the conversion yields blank text
     * @author lance
     */
    public static String capitalizeLetter(String text){
        String c = converCapitalize(text);
        if(StringUtils.isBlank(c)) {
            return "";
        }
        return StringUtils.replacePattern(c, "[a-z]", "");
    }

    /**
     * Returns the first character of the capitalised pinyin,
     * e.g. 中国人民银行 =====>Z
     *
     * @param text text to convert
     * @return the first character, or "" when the conversion yields blank text
     * @author lance
     */
    public static String firstLetter(String text){
        String c = converCapitalize(text);
        if(StringUtils.isBlank(c)) {
            return "";
        }
        return StringUtils.substring(c, 0, 1);
    }

    /**
     * Converts text to pinyin.
     *
     * @param text         text to convert; surrounding whitespace is trimmed
     * @param caseType     case of the pinyin output; when non-null it takes
     *                     precedence and {@code isCapitalize} is ignored
     * @param isCapitalize whether to capitalise the first letter of each
     *                     converted character (only used when {@code caseType}
     *                     is {@code null})
     * @return the converted text, or "" for blank input
     * @author lance
     */
    public static String convert(String text, HanyuPinyinCaseType caseType, boolean isCapitalize) {
        if(StringUtils.isBlank(text)){
            return "";
        }

        HanyuPinyinOutputFormat format = new HanyuPinyinOutputFormat();
        boolean capitalize = isCapitalize;
        if(caseType != null) {
            format.setCaseType(caseType);
            capitalize = false;
        }
        format.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
        format.setVCharType(HanyuPinyinVCharType.WITH_V);

        char[] input = StringUtils.trimToEmpty(text).toCharArray();
        StringBuilder builder = new StringBuilder();
        try {
            for (char c : input) {
                String piece = Character.toString(c);
                // Plain range test instead of compiling a regex per character.
                if (c >= '\u4E00' && c <= '\u9FA5') {
                    String[] readings = PinyinHelper.toHanyuPinyinStringArray(c, format);
                    // pinyin4j returns null when it has no reading for the
                    // character; keep the character itself in that case.
                    if (readings != null && readings.length > 0) {
                        piece = readings[0];
                    }
                }
                builder.append(capitalize ? StringUtils.capitalize(piece) : piece);
            }
        } catch (BadHanyuPinyinOutputFormatCombination ex) {
            ex.printStackTrace();
        }
        return builder.toString();
    }
}
package org.ssgroup.spider;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import org.ssgroup.spider.constant.MaYiAllCityConstant;
import org.ssgroup.spider.domain.City;
import org.ssgroup.spider.domain.CityLocation;
import org.ssgroup.spider.utils.JdbcUtils;
import org.ssgroup.spider.utils.PinYinUtils;
import com.geccocrawler.gecco.GeccoEngine;
import com.geccocrawler.gecco.request.HttpGetRequest;
/**
 * Entry point of the mayi.com crawler.
 *
 * <p>Loads the city and location-type lookup tables from the database into
 * the static caches and then starts one Gecco crawl per city, one city group
 * at a time.
 */
public class Application {
    /** Cities keyed by their lower-case pinyin (the {@code pin_yin} column). */
    public static Map<String, City> CITY_CACHE = new HashMap<String, City>();
    /** Top-level location types (parent_id = 0) keyed by their pinyin. */
    public static Map<String, CityLocation> CITY_LOCATION_CACHE = new HashMap<String, CityLocation>();
    /** Base URL of the site being crawled. */
    public static String URL = "http://www.mayi.com";

    public static void main(String[] args) throws Exception {
        // One-off seeding of the city table
        //initCity();
        // Load cities into the cache
        loadCityToMap();
        // One-off seeding of the location types
        //initCityLocation();
        // Load location types into the cache
        loadCityLocationToMap();
        // Start crawling
        init();
    }

    /**
     * Crawls every city group in turn. A group is started only while
     * {@code MaYiAllCityConstant.on} is set; the flag is cleared when a group
     * starts and set again by the list pipeline.
     *
     * <p>Each pass uses {@code cityNames[i]} and the loop runs until every
     * group has been started; the earlier version always read index 0 and
     * stopped one group short.
     */
    private static void init() {
        String[] cityNames = new String[] {"ABCD","EFGH","JKLM","NPQR","STW","XYZ"};
        Map<String, String[]> allCitys = MaYiAllCityConstant.ALL_CITY;
        int i = 0;
        while (i < cityNames.length) {
            if (!MaYiAllCityConstant.on) {
                // Wait for the pipeline's signal without spinning at full speed.
                try {
                    Thread.sleep(200);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    return;
                }
                continue;
            }
            MaYiAllCityConstant.on = false;
            String cityName = cityNames[i];
            String[] citys = allCitys.get(cityName);
            for (String city : citys) {
                String city_pin_yin = PinYinUtils.convertLower(city);
                String nextUrl = URL + "/" + city_pin_yin;
                System.out.println("MaYiIndexPipeline-->"+nextUrl);
                startGecco(nextUrl);
            }
            i++;
        }
    }

    /**
     * Runs a Gecco engine starting from the given URL.
     *
     * @param url first page to fetch
     */
    private static void startGecco(String url) {
        HttpGetRequest start = new HttpGetRequest(url);
        start.setCharset("UTF-8");
        GeccoEngine.create()
            .classpath("org.ssgroup.spider")
            // First page to fetch
            .start(start)
            // Number of crawler threads
            .thread(1)
            //.debug(true)
            // Pause between two requests of one crawler, in milliseconds
            .interval(5000)
            .run();
    }

    /**
     * Seeds {@code city_location} with the top-level location types
     * (parent_id = 0) from {@code MaYiAllCityConstant.CITY_LOCATION}.
     *
     * @throws Exception if the rollback or closing the JDBC resources fails
     */
    private static void initCityLocation() throws Exception {
        Connection conn = null;
        PreparedStatement pstmt = null;
        try {
            String[] locations = MaYiAllCityConstant.CITY_LOCATION;
            conn = JdbcUtils.getConnection();
            conn.setAutoCommit(false);
            String sql = "INSERT INTO city_location(name,pin_yin,parent_id) " +
                    "VALUES(?,?,?)";
            pstmt = conn.prepareStatement(sql);
            for (String location : locations) {
                pstmt.setString(1, location);
                pstmt.setString(2, PinYinUtils.convertLower(location));
                pstmt.setInt(3, 0);
                pstmt.addBatch();
            }
            pstmt.executeBatch();
            conn.commit();
        } catch (Exception e) {
            e.printStackTrace();
            // getConnection() itself may have failed, leaving conn null.
            if (conn != null) {
                conn.rollback();
            }
        } finally {
            JdbcUtils.close(conn, pstmt, null);
        }
    }

    /**
     * Seeds the {@code city} table from every group of
     * {@code MaYiAllCityConstant.ALL_CITY} except "holdCity". Each group is
     * committed separately.
     *
     * <p>One connection and one statement are reused for all groups; the
     * earlier version opened a new pair per group and closed only the last.
     *
     * @throws Exception if the rollback or closing the JDBC resources fails
     */
    private static void initCity() throws Exception {
        Connection conn = null;
        PreparedStatement pstmt = null;
        try {
            Map<String, String[]> allCitys = MaYiAllCityConstant.ALL_CITY;
            conn = JdbcUtils.getConnection();
            conn.setAutoCommit(false);
            String sql = "INSERT INTO city(name,pin_yin,first_pin_yin,first_last_pin_yin) " +
                    "VALUES(?,?,?,?)";
            pstmt = conn.prepareStatement(sql);
            for (Entry<String, String[]> entry : allCitys.entrySet()) {
                if ("holdCity".equals(entry.getKey())) {
                    continue;
                }
                for (String city : entry.getValue()) {
                    pstmt.setString(1, city);
                    pstmt.setString(2, PinYinUtils.convertLower(city));
                    pstmt.setString(3, PinYinUtils.firstLetter(city).toLowerCase());
                    pstmt.setString(4, PinYinUtils.capitalizeLetter(city).toLowerCase());
                    pstmt.addBatch();
                }
                pstmt.executeBatch();
                conn.commit();
            }
        } catch (Exception e) {
            e.printStackTrace();
            if (conn != null) {
                conn.rollback();
            }
        } finally {
            JdbcUtils.close(conn, pstmt, null);
        }
    }

    /**
     * Loads every row of {@code city} into {@link #CITY_CACHE}, keyed by the
     * {@code pin_yin} column. Query failures are printed and swallowed.
     *
     * @throws Exception if closing the JDBC resources fails
     */
    private static void loadCityToMap() throws Exception {
        Connection conn = null;
        PreparedStatement pstmt = null;
        ResultSet resultSet = null;
        try {
            conn = JdbcUtils.getConnection();
            String sql = "SELECT id,name,pin_yin,first_pin_yin,first_last_pin_yin FROM city";
            pstmt = conn.prepareStatement(sql);
            resultSet = pstmt.executeQuery();
            while (resultSet.next()) {
                String pinYin = resultSet.getString("pin_yin");
                City city = new City();
                city.setId(resultSet.getLong("id"));
                city.setName(resultSet.getString("name"));
                city.setPinYin(pinYin);
                city.setFirstPinYin(resultSet.getString("first_pin_yin"));
                city.setFirstLastPinYin(resultSet.getString("first_last_pin_yin"));
                CITY_CACHE.put(pinYin, city);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            JdbcUtils.close(conn, pstmt, resultSet);
        }
    }

    /**
     * Loads the top-level rows of {@code city_location} (parent_id = 0) into
     * {@link #CITY_LOCATION_CACHE}, keyed by the {@code pin_yin} column.
     * Query failures are printed and swallowed.
     *
     * @throws Exception if closing the JDBC resources fails
     */
    private static void loadCityLocationToMap() throws Exception {
        Connection conn = null;
        PreparedStatement pstmt = null;
        ResultSet resultSet = null;
        try {
            conn = JdbcUtils.getConnection();
            String sql = "SELECT id,name,pin_yin FROM city_location where parent_id=0";
            pstmt = conn.prepareStatement(sql);
            resultSet = pstmt.executeQuery();
            while (resultSet.next()) {
                String pinYin = resultSet.getString("pin_yin");
                CityLocation cityLocation = new CityLocation();
                cityLocation.setId(resultSet.getLong("id"));
                cityLocation.setName(resultSet.getString("name"));
                cityLocation.setPinYin(pinYin);
                CITY_LOCATION_CACHE.put(pinYin, cityLocation);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            JdbcUtils.close(conn, pstmt, resultSet);
        }
    }
}
-- City table: one row per crawled city.
-- Columns: name = city name, pin_yin = city pinyin (used as the cache key by
-- Application.loadCityToMap), first_pin_yin = first letter,
-- first_last_pin_yin = abbreviation (Application.initCity stores the initials
-- of every character here).
CREATE TABLE `city` (
`id` bigint(10) NOT NULL AUTO_INCREMENT,
`name` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT '城市名称',
`pin_yin` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT '城市拼音',
`first_pin_yin` varchar(10) COLLATE utf8_bin DEFAULT NULL COMMENT '首字母简写',
`first_last_pin_yin` varchar(10) COLLATE utf8_bin DEFAULT NULL COMMENT '首尾字母简写',
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 COLLATE=utf8_bin COMMENT='城市表';
-- City location table: location types (parent_id = 0) and the locations
-- found under them for each city.
-- city_pin_yin is widened to varchar(50) to match city.pin_yin: the crawler
-- writes the full city pinyin here, which can exceed 10 characters
-- (e.g. "xishuangbanna"). The status column comment previously repeated
-- the "city pinyin" text of another column.
CREATE TABLE `city_location` (
`id` bigint(10) NOT NULL AUTO_INCREMENT,
`name` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT '城市地段名称',
`pin_yin` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT '城市地段拼音',
`city_id` bigint(10) DEFAULT NULL COMMENT '城市ID',
`city_name` varchar(10) COLLATE utf8_bin DEFAULT NULL COMMENT '城市名称',
`city_pin_yin` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT '城市拼音',
`parent_id` bigint(10) DEFAULT NULL COMMENT '父ID',
`status` int(2) DEFAULT '0' COMMENT '状态',
`href` varchar(255) COLLATE utf8_bin DEFAULT NULL COMMENT '请求路径',
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 COLLATE=utf8_bin COMMENT='城市商业地段表';
-- Room listing table, filled by MaYiListPipeline.saveRooms.
-- id is supplied by the crawler (no AUTO_INCREMENT), so inserting the same
-- room id twice violates the primary key.
-- Columns: price, house_location = location text, original_url = original
-- image URL, image_url = thumbnail URL, title, num_room = number of rooms,
-- num_house = number of guests, city = city the listing belongs to.
CREATE TABLE `rooms` (
`id` bigint(10) NOT NULL,
`price` DOUBLE(10,2) DEFAULT NULL COMMENT '价格',
`house_location` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT '房源地理位置',
`original_url` varchar(255) COLLATE utf8_bin DEFAULT NULL COMMENT '图片原始地址',
`image_url` varchar(255) COLLATE utf8_bin DEFAULT NULL COMMENT '缩略图地址',
`title` varchar(255) COLLATE utf8_bin DEFAULT NULL COMMENT '房源标题',
`num_room` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT '几居室',
`num_house` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT '可住几个人',
`city` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT '属于哪个城市',
`city_id` bigint(10) NOT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin COMMENT='房源表';