http://blog.csdn.net/wyymaomi/archive/2008/12/03/3439066.aspx
Spider: a simple crawler program
1. Prerequisites
htmlparser
Homepage: http://sourceforge.net/projects/htmlparser/
Download: http://sourceforge.net/project/showfiles.php?group_id=24399
File: htmlparser1_6_20060610.zip
<dependency>
<groupId>org.htmlparser</groupId>
<artifactId>htmlparser</artifactId>
<version>1.6</version>
</dependency>
cpdetector
Homepage: http://cpdetector.sourceforge.net/
Download: http://sourceforge.net/project/showfiles.php?group_id=114421
File: cpdetector_eclipse_project_1.0.7.zip
<dependency>
<groupId>cpdetector</groupId>
<artifactId>cpdetector</artifactId>
<version>1.0.5</version>
</dependency>
spindle
Homepage: http://www.bitmechanic.com/projects/spindle/ (no longer accessible)
2. A spider built by modifying the spindle code
It simply prints the URLs it finds; the parsed content and so on are not processed any further.
The base class for parsing HTML, HtmlParserUtil.java:
package com.sillycat.api.commons.utils.html;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.SocketException;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.net.UnknownHostException;
import java.nio.charset.Charset;
import org.htmlparser.Parser;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;
import cpdetector.io.ASCIIDetector;
import cpdetector.io.CodepageDetectorProxy;
import cpdetector.io.JChardetFacade;
import cpdetector.io.ParsingDetector;
import cpdetector.io.UnicodeDetector;
public class HtmlParserUtil {
/* Initial buffer size for the StringBuffer */
public static final int TRANSFER_SIZE = 4096;
/* Line separator of the current platform */
public static String lineSep = System.getProperty("line.separator");
/* Auto-detect the page encoding, to avoid garbled Chinese text */
public static String autoDetectCharset(URL url) {
CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();
/**
* ParsingDetector can detect the encoding of HTML, XML, and similar files or
* character streams; the boolean constructor argument controls whether the
* detection details are printed (false = silent)
*/
detector.add(new ParsingDetector(false));
detector.add(JChardetFacade.getInstance());
detector.add(ASCIIDetector.getInstance());
detector.add(UnicodeDetector.getInstance());
Charset charset = null;
try {
charset = detector.detectCodepage(url);
} catch (MalformedURLException mue) {
mue.printStackTrace();
} catch (IOException ie) {
ie.printStackTrace();
}
if (charset == null)
charset = Charset.defaultCharset();
return charset.name();
}
/* Parse a standard HTML page with the given encoding, in preparation for building an index */
public static String[] parseHtml(String url, String charset) {
String result[] = null;
String content = null;
try {
URL source = new URL(url);
InputStream in = source.openStream();
BufferedReader reader = new BufferedReader(new InputStreamReader(
in, charset));
String line;
StringBuffer temp = new StringBuffer(TRANSFER_SIZE);
while ((line = reader.readLine()) != null) {
temp.append(line);
temp.append(lineSep);
}
reader.close();
in.close();
content = temp.toString();
} catch (UnsupportedEncodingException uee) {
uee.printStackTrace();
} catch (MalformedURLException mue) {
System.err.println("Invalid URL : " + url);
} catch (UnknownHostException uhe) {
System.err.println("UnknowHost : " + url);
} catch (SocketException se) {
System.err.println("Socket Error : " + se.getMessage() + " " + url);
} catch (SocketTimeoutException ste) {
System.err.println("Socket Connection Time Out : " + url);
} catch (FileNotFoundException fnfe) {
/* fnfe.getCause() is usually null here, so report the URL itself */
System.err.println("broken link " + url + " ignored");
} catch (IOException ie) {
ie.printStackTrace();
}
if (content != null) {
Parser myParser = Parser.createParser(content, charset);
HtmlPage visitor = new HtmlPage(myParser);
try {
myParser.visitAllNodesWith(visitor);
String body = null;
String title = "Untitled";
if (visitor.getBody() != null) {
NodeList nodelist = visitor.getBody();
body = nodelist.asString().trim();
}
if (visitor.getTitle() != null){
title = visitor.getTitle();
}
result = new String[] { body, title };
} catch (ParserException pe) {
pe.printStackTrace();
}
}
return result;
}
}
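For reference, here is a minimal sketch of how the two helpers above could be combined; the URL is only a placeholder:
import java.net.URL;
import com.sillycat.api.commons.utils.html.HtmlParserUtil;

public class HtmlParserUtilDemo {
    public static void main(String[] args) throws Exception {
        String url = "http://www.example.com/index.html"; // placeholder URL
        // detect the page encoding first, then parse with that encoding
        String charset = HtmlParserUtil.autoDetectCharset(new URL(url));
        String[] result = HtmlParserUtil.parseHtml(url, charset);
        if (result != null) {
            System.out.println("title : " + result[1]);
            System.out.println("body length : " + (result[0] == null ? 0 : result[0].length()));
        }
    }
}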
The multi-threaded crawler class, HtmlCaptureRunner.java:
package com.sillycat.api.thread.runner;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.SocketException;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.HashSet;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.htmlparser.Parser;
import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.BaseHrefTag;
import org.htmlparser.tags.FrameTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.MetaTag;
import org.htmlparser.util.EncodingChangeException;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import com.sillycat.api.commons.utils.StringUtil;
import com.sillycat.api.commons.utils.html.HtmlParserUtil;
public class HtmlCaptureRunner implements Runnable {
public Log logger = LogFactory.getLog(getClass());
/* Base (initial) URL */
protected String baseURL = null;
private String contentPath = null;
/**
* Queue of URLs waiting to be parsed; every newly discovered link is added
* here and taken out again in first-in first-out (FIFO) order
*/
protected ArrayList URLs = new ArrayList();
/* Set of URLs already stored, to avoid fetching the same link twice */
protected HashSet indexedURLs = new HashSet();
protected Parser parser = new Parser();
/* Number of worker threads, two by default */
protected int threads = 2;
/* Character encoding used when parsing pages */
protected String charset;
/* Base port */
protected int basePort;
/* Base host */
protected String baseHost;
/* Whether to store the results, true by default */
protected boolean justDatabase = true;
/* Whether to check the index for the current URL, to avoid repeated fetching */
protected boolean isRepeatedCheck = false;
public HtmlCaptureRunner() {
PrototypicalNodeFactory factory = new PrototypicalNodeFactory();
factory.registerTag(new LocalLinkTag());
factory.registerTag(new LocalFrameTag());
factory.registerTag(new LocalBaseHrefTag());
parser.setNodeFactory(factory);
}
public void capture() {
URLs.clear();
URLs.add(getBaseURL());
int responseCode = 0;
String contentType = "";
try {
HttpURLConnection uc = (HttpURLConnection) new URL(baseURL)
.openConnection();
responseCode = uc.getResponseCode();
contentType = uc.getContentType();
} catch (MalformedURLException mue) {
logger.error("Invalid URL : " + getBaseURL());
} catch (UnknownHostException uhe) {
logger.error("UnknowHost : " + getBaseURL());
} catch (SocketException se) {
logger.error("Socket Error : " + se.getMessage() + " "
+ getBaseURL());
} catch (IOException ie) {
logger.error("IOException : " + ie);
}
if (responseCode == HttpURLConnection.HTTP_OK && contentType != null
&& contentType.startsWith("text/html")) {
try {
charset = HtmlParserUtil.autoDetectCharset(new URL(baseURL));
basePort = new URL(baseURL).getPort();
baseHost = new URL(baseURL).getHost();
/* Chinese pages are often mis-detected as windows-1252; assume GBK in that case */
if (charset.equals("windows-1252"))
charset = "GBK";
long start = System.currentTimeMillis();
ArrayList threadList = new ArrayList();
for (int i = 0; i < threads; i++) {
Thread t = new Thread(this, "Spider Thread #" + (i + 1));
t.start();
threadList.add(t);
}
while (threadList.size() > 0) {
Thread child = (Thread) threadList.remove(0);
try {
child.join();
} catch (InterruptedException ie) {
logger.error("InterruptedException : " + ie);
}
}
// for (int i = 0; i < threads; i++) {
// threadPool.getThreadPoolExcutor().execute(new
// Thread(this,"Spider Thread #" + (i + 1)));
// }
long elapsed = System.currentTimeMillis() - start;
logger.info("Finished in " + (elapsed / 1000) + " seconds");
logger.info("The Count of the Links Captured is "
+ indexedURLs.size());
} catch (MalformedURLException e) {
e.printStackTrace();
}
}
}
public void run() {
String url;
while ((url = dequeueURL()) != null) {
if (justDatabase) {
process(url);
}
}
threads--;
}
/**
* Process a single URL: parse the page and add it to the Lucene index;
* auto-detecting the page encoding keeps the capture running smoothly
*/
protected void process(String url) {
String result[];
String content = null;
String title = null;
result = HtmlParserUtil.parseHtml(url, charset);
content = result[0];
title = result[1];
if (content != null && content.trim().length() > 0) {
// content
System.out.println(url);
// title
// DateTools.timeToString(System.currentTimeMillis()
}
}
/* Take a single URL off the URLs queue */
public synchronized String dequeueURL() {
while (true)
if (URLs.size() > 0) {
String url = (String) URLs.remove(0);
indexedURLs.add(url);
if (isToBeCaptured(url)) {
NodeList list;
try {
int bookmark = URLs.size();
/* fetch all the nodes of the page */
parser.setURL(url);
try {
list = new NodeList();
for (NodeIterator e = parser.elements(); e
.hasMoreNodes();)
list.add(e.nextNode());
} catch (EncodingChangeException ece) {
/* the page's declared encoding changed mid-parse: reset the parser and read it again */
parser.reset();
list = new NodeList();
for (NodeIterator e = parser.elements(); e
.hasMoreNodes();)
list.add(e.nextNode());
}
/**
* Handle the Robots <META> tag as described at
* http://www.robotstxt.org/wc/meta-user.html
*/
NodeList robots = list
.extractAllNodesThatMatch(
new AndFilter(new NodeClassFilter(
MetaTag.class),
new HasAttributeFilter("name",
"robots")), true);
if (0 != robots.size()) {
MetaTag robot = (MetaTag) robots.elementAt(0);
String content = robot.getAttribute("content")
.toLowerCase();
if ((-1 != content.indexOf("none"))
|| (-1 != content.indexOf("nofollow")))
for (int i = bookmark; i < URLs.size(); i++)
URLs.remove(i);
}
} catch (ParserException pe) {
logger.error("ParserException : " + pe);
}
return url;
}
} else {
threads--;
if (threads > 0) {
try {
wait();
threads++;
} catch (InterruptedException ie) {
logger.error("InterruptedException : " + ie);
}
} else {
notifyAll();
return null;
}
}
}
private boolean isHTML(String url) {
if (!url.endsWith(".html")) {
return false;
}
if (StringUtil.isNotBlank(contentPath)) {
if (!url.startsWith(baseURL + "/" + contentPath)) {
return false;
}
}
return true;
}
/**
* Decide whether an extracted link qualifies for parsing: its port and host
* must match the base URL and its content type must be text/html or text/plain
*/
public boolean isToBeCaptured(String url) {
boolean flag = false;
HttpURLConnection uc = null;
int responseCode = 0;
String contentType = "";
String host = "";
int port = 0;
try {
URL source = new URL(url);
String protocol = source.getProtocol();
if (protocol != null && protocol.equals("http")) {
host = source.getHost();
port = source.getPort();
uc = (HttpURLConnection) source.openConnection();
uc.setConnectTimeout(8000);
responseCode = uc.getResponseCode();
contentType = uc.getContentType();
}
} catch (MalformedURLException mue) {
logger.error("Invalid URL : " + url);
} catch (UnknownHostException uhe) {
logger.error("UnknowHost : " + url);
} catch (SocketException se) {
logger.error("Socket Error : " + se.getMessage() + " " + url);
} catch (SocketTimeoutException ste) {
logger.error("Socket Connection Time Out : " + url);
} catch (FileNotFoundException fnfe) {
logger.error("broken link " + url + " ignored");
} catch (IOException ie) {
logger.error("IOException : " + ie);
}
if (port == basePort
&& responseCode == HttpURLConnection.HTTP_OK
&& host.equals(baseHost)
&& contentType != null
&& (contentType.startsWith("text/html") || contentType
.startsWith("text/plain")))
flag = true;
return flag;
}
class LocalLinkTag extends LinkTag {
public void doSemanticAction() {
String link = getLink();
if (link.endsWith("/"))
link = link.substring(0, link.length() - 1);
int pos = link.indexOf("#");
if (pos != -1)
link = link.substring(0, pos);
/* add the link to the processing queue */
if (!(indexedURLs.contains(link) || URLs.contains(link))) {
if (isHTML(link)) {
URLs.add(link);
}
}
setLink(link);
}
}
/**
* Frame tag that rewrites the SRC URLs. The SRC URLs are mapped to local
* targets if they match the source.
*/
class LocalFrameTag extends FrameTag {
public void doSemanticAction() {
String link = getFrameLocation();
if (link.endsWith("/"))
link = link.substring(0, link.length() - 1);
int pos = link.indexOf("#");
if (pos != -1)
link = link.substring(0, pos);
/* add the link to the processing queue */
if (!(indexedURLs.contains(link) || URLs.contains(link))) {
if (isHTML(link)) {
URLs.add(link);
}
}
setFrameLocation(link);
}
}
/**
* Base tag that doesn't show. The toHtml() method is overridden to return
* an empty string, effectively shutting off the base reference.
*/
class LocalBaseHrefTag extends BaseHrefTag {
public String toHtml() {
return ("");
}
}
public String getBaseURL() {
return baseURL;
}
public void setBaseURL(String baseURL) {
this.baseURL = baseURL;
}
public int getThreads() {
return threads;
}
public void setThreads(int threads) {
this.threads = threads;
}
public String getCharset() {
return charset;
}
public void setCharset(String charset) {
this.charset = charset;
}
public int getBasePort() {
return basePort;
}
public void setBasePort(int basePort) {
this.basePort = basePort;
}
public String getBaseHost() {
return baseHost;
}
public void setBaseHost(String baseHost) {
this.baseHost = baseHost;
}
public boolean isJustDatabase() {
return justDatabase;
}
public void setJustDatabase(boolean justDatabase) {
this.justDatabase = justDatabase;
}
public String getContentPath() {
return contentPath;
}
public void setContentPath(String contentPath) {
this.contentPath = contentPath;
}
}
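Besides the Spring wiring below, the runner can also be configured directly through its setters; a minimal sketch, where the values are only examples taken from the properties file further down:
// assumes the HtmlCaptureRunner class shown above is on the classpath
HtmlCaptureRunner runner = new HtmlCaptureRunner();
runner.setBaseURL("http://www.safedv.com"); // example value
runner.setContentPath("product");
runner.setBasePort(80);
runner.setCharset("UTF-8");
runner.setThreads(2);
runner.capture(); // spawns the worker threads and blocks until they finish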
The Spring configuration file, applicationContext-bean.xml:
<bean id="productCapture"
class="com.sillycat.api.thread.runner.HtmlCaptureRunner" >
<property name="contentPath" value="${product.contentPath}" />
<property name="basePort" value="${product.base.port}" />
<property name="baseURL" value="${product.base.url}" />
<property name="charset" value="${product.base.code}" />
<property name="threads" value="${product.base.threads}"/>
</bean>
<bean id="messageCapture"
class="com.sillycat.api.thread.runner.HtmlCaptureRunner" >
<property name="contentPath" value="${message.contentPath}" />
<property name="basePort" value="${message.base.port}" />
<property name="baseURL" value="${message.base.url}" />
<property name="charset" value="${message.base.code}" />
<property name="threads" value="${message.base.threads}"/>
</bean>
The easySearch.properties configuration file:
#==========================================
# spider configuration
#==========================================
product.contentPath=product
product.base.port=80
product.base.url=http://www.safedv.com
product.base.code=UTF-8
product.base.threads=3
message.contentPath=message
message.base.port=80
message.base.url=http://www.safedv.com
message.base.code=UTF-8
message.base.threads=3
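For the ${...} placeholders above to resolve, the application context presumably also declares a PropertyPlaceholderConfigurer; a minimal sketch, with the classpath location assumed:
<bean class="org.springframework.beans.factory.config.PropertyPlaceholderConfigurer">
<property name="location" value="classpath:easySearch.properties" />
</bean>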
The unit test class, HtmlRunnerTest.java:
package com.sillycat.api.thread;
import com.sillycat.api.commons.base.BaseManagerTest;
import com.sillycat.api.thread.runner.HtmlCaptureRunner;
public class HtmlRunnerTest extends BaseManagerTest {
private HtmlCaptureRunner productCapture;
private HtmlCaptureRunner messageCapture;
protected void setUp() throws Exception {
super.setUp();
productCapture = (HtmlCaptureRunner) appContext.getBean("productCapture");
messageCapture = (HtmlCaptureRunner) appContext.getBean("messageCapture");
}
protected void tearDown() throws Exception {
super.tearDown();
}
public void testDummy() {
assertTrue(true);
}
// the "n" prefix keeps JUnit 3 from running this test;
// rename it to "testProductCapture" to enable it
public void ntestProductCapture() {
productCapture.capture();
}
public void testMessageCapture(){
messageCapture.capture();
}
}
This article is from a CSDN blog; please credit the source when reposting: http://blog.csdn.net/wyymaomi/archive/2008/12/03/3439066.aspx
A simple Java web crawler (spider)
http://blog.csdn.net/wyymaomi/archive/2008/12/03/3439016.aspx
A simple Java web crawler; for lack of time there is no further explanation.
Download the required htmlparser.jar from the official site.
---------------------------------------------Spider.java-----------------------------------------------------------------
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import org.htmlparser.RemarkNode;
import org.htmlparser.StringNode;
import org.htmlparser.Node;
import org.htmlparser.tags.*;
import org.htmlparser.Parser;
import org.htmlparser.filters.StringFilter;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import java.util.Queue;
import java.util.LinkedList;
public class Spider implements Runnable {
boolean search_key_words = false;
int count = 0;
int limitsite = 10;
int countsite = 1;
String keyword = "中国";// search keyword
Parser parser = new Parser();
// List linklist = new ArrayList();
String startsite = "";// starting site of the search
SearchResultBean srb;// holds one search result
List resultlist = new ArrayList();// links on which the keyword was found
List searchedsite = new ArrayList();// sites that have already been searched
Queue linklist = new LinkedList();// queue of links waiting to be parsed
HashMap<String, ArrayList<String>> disallowListCache = new HashMap<String, ArrayList<String>>();
public Spider(String keyword, String startsite) {
this.keyword = keyword;
this.startsite = startsite;
linklist.add(startsite);
srb = new SearchResultBean();
}
public void run() {
search(linklist);
}
public void search(Queue queue) {
String url = "";
while (!queue.isEmpty()) {
url = queue.peek().toString();// peek at the head of the queue
try {
if (!isSearched(searchedsite, url)) {
if (isRobotAllowed(new URL(url)))// check whether this link may be crawled
processHtml(url);
else
System.out.println("this page is disallowed to search");
}
} catch (Exception ex) {
// skip this URL and carry on with the rest of the queue
}
queue.remove();
}
}
/**
* Parse an HTML page
* @param url
* @throws ParserException
* @throws Exception
*/
public void processHtml(String url) throws ParserException, Exception {
searchedsite.add(url);
count = 0;
// use a fresh bean per page, otherwise every entry in resultlist
// would point to the same (last) result
srb = new SearchResultBean();
System.out.println("searching ... :" + url);
parser.setURL(url);
parser.setEncoding("GBK");
URLConnection uc = parser.getConnection();
uc.connect();
//uc.getLastModified();
NodeIterator nit = parser.elements();
while (nit.hasMoreNodes()) {
Node node = nit.nextNode();
parserNode(node);
}
srb.setKeywords(keyword);
srb.setUrl(url);
srb.setCount_key_words(count);
resultlist.add(srb);
System.out.println("count keywords is :" + count);
System.out.println("----------------------------------------------");
}
/**
* Process an HTML tag
* @param tag
* @throws Exception
*/
public void dealTag(Tag tag) throws Exception {
NodeList list = tag.getChildren();
if (list != null) {
NodeIterator it = list.elements();
while (it.hasMoreNodes()) {
Node node = it.nextNode();
parserNode(node);
}
}
}
/**
* Process an HTML node
* @param node
* @throws Exception
*/
public void parserNode(Node node) throws Exception {
if (node instanceof StringNode) {// is it a text node?
StringNode sNode = (StringNode) node;
StringFilter sf = new StringFilter(keyword,false);
search_key_words = sf.accept(sNode);
if (search_key_words) {
count++;
}
// System.out.println("text is :"+sNode.getText().trim());
} else if (node instanceof Tag) {// is it a tag node?
Tag atag = (Tag) node;
if (atag instanceof TitleTag) {// is it a TITLE tag?
srb.setTitle(atag.getText());
}
if (atag instanceof LinkTag) {// is it a LINK (anchor) tag?
LinkTag linkatag = (LinkTag) atag;
checkLink(linkatag.getLink(), linklist);
// System.out.println("-----------------this is link --------------");
}
dealTag(atag);
} else if (node instanceof RemarkNode) {// is it a comment node?
// System.out.println("this is remark");
}
}
/*
* Check whether a link needs to be added to the queue
*/
public void checkLink(String link, Queue queue) {
if (link != null && !link.equals("") && link.indexOf("#") == -1) {
if (!link.startsWith("http://") && !link.startsWith("ftp://")
&& !link.startsWith("www.")) {
link = "file:///" + link;
} else if (link.startsWith("www.")) {
link = "http://" + link;
}
if (queue.isEmpty())
queue.add(link);
else {
String link_end_=link.endsWith("/")?link.substring(0,link.lastIndexOf("/")):(link+"/");
if (!queue.contains(link)&&!queue .contains(link_end_)) {
queue.add(link);
}
}
}
}
/**
* Check whether the link has already been scanned
* @param list
* @param url
* @return
*/
public boolean isSearched(List list, String url) {
String url_end_ = "";
if (url.endsWith("/")) {
url_end_ = url.substring(0, url.lastIndexOf("/"));
} else {
url_end_ = url + "/";
}
if (list.size() > 0) {
if (list.indexOf(url) != -1 || list.indexOf(url_end_) != -1) {
return true;
}
}
return false;
}
/**
* Check whether the URL may be crawled according to robots.txt
* @param urlToCheck
* @return
*/
private boolean isRobotAllowed(URL urlToCheck) {
String host = urlToCheck.getHost().toLowerCase();// host of the given URL
// System.out.println("host = " + host);
// get the cached list of URLs this host disallows
ArrayList<String> disallowList = disallowListCache.get(host);
// if there is no cache yet, download and cache it
if (disallowList == null) {
disallowList = new ArrayList<String>();
try {
URL robotsFileUrl = new URL("http://" + host + "/robots.txt");
BufferedReader reader = new BufferedReader(
new InputStreamReader(robotsFileUrl.openStream()));
// read the robots file and build the list of disallowed paths
String line;
while ((line = reader.readLine()) != null) {
if (line.indexOf("Disallow:") == 0) {// 是否包含"Disallow:"
String disallowPath = line.substring("Disallow:"
.length());// 获取不允许访问路径
// 检查是否有注释。
int commentIndex = disallowPath.indexOf("#");
if (commentIndex != -1) {
disallowPath = disallowPath.substring(0,
commentIndex);// 去掉注释
}
disallowPath = disallowPath.trim();
disallowList.add(disallowPath);
}
}
for (Iterator it = disallowList.iterator(); it.hasNext();) {
System.out.println("Disallow is :" + it.next());
}
// cache the disallowed paths for this host
disallowListCache.put(host, disallowList);
} catch (Exception e) {
return true; // the site has no robots.txt at its root, so assume crawling is allowed
}
}
String file = urlToCheck.getFile();
// System.out.println("文件getFile()="+file);
for (int i = 0; i < disallowList.size(); i++) {
String disallow = disallowList.get(i);
if (file.startsWith(disallow)) {
return false;
}
}
return true;
}
public static void main(String[] args) {
Spider ph = new Spider("英超", "http://www.microsoft.com");
try {
// ph.processHtml();
Thread search = new Thread(ph);
search.start();// start the crawler thread
} catch (Exception ex) {
// ignore startup errors in this simple demo
}
}
}
--------------------------------------SearchResultBean.java---------------------------------------------------------
public class SearchResultBean {
String url = "";
String title = "";
String keywords = "";
int count_key_words = 0;
public int getCount_key_words() {
return count_key_words;
}
public void setCount_key_words(int count_key_words) {
this.count_key_words = count_key_words;
}
public String getKeywords() {
return keywords;
}
public void setKeywords(String keywords) {
this.keywords = keywords;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getUrl() {
return url;
}
public void setUrl(String url) {
this.url = url;
}
}
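A minimal sketch of driving the Spider and reading its results after the crawl thread finishes; it assumes the demo class sits in the same package as Spider, since the resultlist field is package-visible:
public class SpiderDemo {
    public static void main(String[] args) throws InterruptedException {
        // the keyword and start site are only examples
        Spider spider = new Spider("中国", "http://www.example.com");
        Thread search = new Thread(spider);
        search.start();
        search.join(); // wait for the crawl to finish
        for (Object o : spider.resultlist) {
            SearchResultBean bean = (SearchResultBean) o;
            System.out.println(bean.getUrl() + " : " + bean.getCount_key_words() + " hit(s)");
        }
    }
}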
This article is from a CSDN blog; please credit the source when reposting: http://blog.csdn.net/wyymaomi/archive/2008/12/03/3439016.aspx