Java HttpURLConnection 抓取网页内容 解析gzip格式输入流数据并转换为String格式字符串

最近GFW为了刷存在感,搞得大家是头晕眼花,修改hosts 几乎成了每日必备工作。

 

索性写了一个小程序,分享给办公室的同事们。其中有一项功能是抓取网络上的hosts,费了一些周折。

我是在一个博客上抓取的。这位朋友的博客应该做了防盗链,不过他的方式比较简单,就是一个5位数的整型随机数,这里折腾一下就OK了。

要命的是,这个链接返回的数据流居然是gzip压缩的。这个问题让我郁闷了好久,一直以为是编码格式导致解析不出结果,后来才发现是gzip搞的。

 

主要的一段代码做个记录吧。

  1 /**

  2  * 网络工具类 用于抓取http://serve.netsh.org上的hosts数据

  3  *

  4  * @author tone

  5  */

  6 public class NetUtil {

  7 

  8     private final static String ENCODING = "UTF-8";

  9     private final static String GZIPCODING = "gzip";

 10     private final static String HOST = "http://serve.netsh.org/pub/hosts.php";

 11     private final static String COOKIE = "hostspasscode=%s; Hm_lvt_e26a7cd6079c926259ded8f19369bf0b=1421846509,1421846927,1421847015,1421849633; Hm_lpvt_e26a7cd6079c926259ded8f19369bf0b=1421849633";

 12     private final static String OFF = "off";

 13     private final static String ON = "on";

 14     private final static int RANDOM = 100000;

 15     private static String hostspasscode = null;

 16     private static NetUtil instance;

 17 

 18     public static NetUtil getInstance() {

 19         if (instance == null) {

 20             instance = new NetUtil();

 21         }

 22         return instance;

 23     }

 24 

 25     private NetUtil() {

 26         hostspasscode = createRandomCookies();

 27     }

 28 

 29     /**

 30      * 获取html内容

 31      *

 32      * @param gs

 33      * @param wk

 34      * @param twttr

 35      * @param fb

 36      * @param flkr

 37      * @param dpbx

 38      * @param odrvB

 39      * @param yt

 40      * @param nohl

 41      * @return

 42      */

 43     public String getHtmlInfo(boolean gs, boolean wk, boolean twttr, boolean fb,

 44             boolean flkr, boolean dpbx, boolean odrv,

 45             boolean yt, boolean nohl) throws Exception {

 46         HttpURLConnection conn = null;

 47 

 48         String result = "";

 49 

 50         //String cookie = "hostspasscode="+hostspasscode+"; Hm_lvt_e26a7cd6079c926259ded8f19369bf0b=1421846509,1421846927,1421847015,1421849633; Hm_lpvt_e26a7cd6079c926259ded8f19369bf0b=1421849633";

 51         String cookie = String.format(COOKIE, hostspasscode);

 52 

 53         //URL url = new URL("http://serve.netsh.org/pub/hosts.php?passcode=13008&gs=on&wk=on&twttr=on&fb=on&flkr=on&dpbx=on&odrv=on&yt=on&nolh=on");

 54         URL url = new URL(createUrl(hostspasscode, gs, wk, twttr, fb, flkr, dpbx, odrv, yt, nohl));

 55         //System.out.println(cookie);

 56        // System.out.println(url.toString());

 57         

 58        conn = (HttpURLConnection) url.openConnection();

 59 

 60         conn.setConnectTimeout(5 * 1000);

 61         conn.setDoOutput(true);

 62         //get方式提交

 63         conn.setRequestMethod("GET");

 64         //凭借请求头文件

 65         conn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");

 66         conn.setRequestProperty("Accept-Encoding", "gzip, deflate");

 67         conn.setRequestProperty("Accept-Language", "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3");

 68         conn.setRequestProperty("Connection", "keep-alive");

 69         conn.setRequestProperty("Cookie", cookie);

 70         conn.setRequestProperty("Host", "serve.netsh.org");

 71         conn.setRequestProperty("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0");

 72 

 73         // conn.setRequestProperty("Referer", "http://serve.netsh.org/pub/gethosts.php");    

 74         // conn.setRequestProperty("X-Requested-With", "XMLHttpRequest");  

 75        

 76         conn.connect();

 77         

 78         String encoding = conn.getContentEncoding();

 79         

 80         result = readStream(conn.getInputStream(), encoding);

 81         //测试进度条显示

 82         // result = readStream(new FileInputStream(new File("/home/tone/Resident.Evil.Damnation.2012.1080p.BluRay.x264.DTS-WiKi.mkv")), "11");

 83 

 84         conn.disconnect();

 85         if (nohl) {

 86              result=getLocalHost()+result;

 87         }

 88        

 89         return result;

 90     }

 91 

 92     /**

 93      * 读取将InputStream中的字节读以字符的形式取到字符串中,如果encoding是gzip,那么需要先有GZIPInputStream进行封装

 94      *

 95      * @param inputStream InputStream字节流

 96      * @param encoding 编码格式

 97      * @return String类型的形式

 98      * @throws IOException IO异常

 99      */

100     private String readStream(InputStream inputStream, String encoding) throws Exception {

101         StringBuffer buffer = new StringBuffer();

102         ProgressMonitorInputStream pmis = null;

103          

104         InputStreamReader inputStreamReader = null;

105         GZIPInputStream gZIPInputStream = null;

106         if (GZIPCODING.equals(encoding)) {

107             gZIPInputStream = new GZIPInputStream(inputStream);

108             inputStreamReader = new InputStreamReader(ProgressUtil.getMonitorInputStream(gZIPInputStream, "获取网络数据"), ENCODING);

109            

110         } else {

111             

112             inputStreamReader = new InputStreamReader(ProgressUtil.getMonitorInputStream(inputStream, "获取网络数据"), ENCODING);

113         }

114         

115         

116         char[] c = new char[1024];

117         

118         int lenI;

119         while ((lenI = inputStreamReader.read(c)) != -1) {

120             

121             buffer.append(new String(c, 0, lenI));

122             

123         }

124         if (inputStream != null) {

125             inputStream.close();

126         }

127         if (gZIPInputStream != null) {

128             gZIPInputStream.close();

129         }

130         if (pmis!=null) {

131             gZIPInputStream.close();

132         }

133         

134 

135         return buffer.toString();

136 

137 

138     }

139 

140     /**

141      * 生成随机Cookies数组

142      *

143      * @return 五位随机数字

144      */

145     private String createRandomCookies() {

146 

147         return String.valueOf(Math.random() * RANDOM).substring(0, 5);

148 

149     }

150 

151     /**

152      * 生成链接字符串

153      *

154      * @param hostspasscode

155      * @param gs

156      * @param wk

157      * @param twttr

158      * @param fb

159      * @param flkr

160      * @param dpbx

161      * @param odrvB

162      * @param yt

163      * @param nohl

164      * @return

165      */

166     private String createUrl(String hostspasscode, boolean gs, boolean wk, boolean twttr, boolean fb,

167             boolean flkr, boolean dpbx, boolean odrv,

168             boolean yt, boolean nohl) {

169         StringBuffer buffer = new StringBuffer();

170         buffer.append(HOST);

171         buffer.append("?passcode=" + hostspasscode);

172         if (gs) {

173             buffer.append("&gs=" + ON);

174         } else {

175             buffer.append("&gs=" + OFF);

176         }

177         if (wk) {

178             buffer.append("&wk=" + ON);

179         } else {

180             buffer.append("&wk=" + OFF);

181         }

182         if (twttr) {

183             buffer.append("&twttr=" + ON);

184         } else {

185             buffer.append("&twttr=" + OFF);

186         }

187         if (fb) {

188             buffer.append("&fb=" + ON);

189         } else {

190             buffer.append("&fb=" + OFF);

191         }

192         if (flkr) {

193             buffer.append("&flkr=" + ON);

194         } else {

195             buffer.append("&flkr=" + OFF);

196         }

197         if (dpbx) {

198             buffer.append("&dpbx=" + ON);

199         } else {

200             buffer.append("&dpbx=" + OFF);

201         }

202         if (odrv) {

203             buffer.append("&odrv=" + ON);

204         } else {

205             buffer.append("&odrv=" + OFF);

206         }

207         if (yt) {

208             buffer.append("&yt=" + ON);

209         } else {

210             buffer.append("&yt=" + OFF);

211         }

212         if (nohl) {

213             buffer.append("&nohl=" + ON);

214         } else {

215             buffer.append("&nohl=" + OFF);

216         }

217         return buffer.toString();

218     }

219 

220     private String getLocalHost() throws Exception {

221         

222         StringBuffer buffer=new StringBuffer();

223         String hostName=OSUtil.getInstance().getLocalhostName();

224         buffer.append("#LOCALHOST begin"+"\n");

225         buffer.append("127.0.0.1\tlocalhost"+"\n");

226         if (hostName!=null&&!"".equals(hostName)) {

227              buffer.append("127.0.1.1\t"+hostName+"\n");

228         }

229         

230         buffer.append("#LOCALHOST end"+"\n");

231         return  buffer.toString();

232        

233         

234         

235     }

236     

237 }

 

你可能感兴趣的:(Java HttpURLConnection 抓取网页内容 解析gzip格式输入流数据并转换为String格式字符串)