Python 爬虫学习 urllib

  1. 网页抓取
    # -*-coding: utf-8 -*-
    
    
    
    import urllib
    
    
    
    url = "http://www.cndzz.com/"
    
    
    
    html = urllib.urlopen(url)
    
    
    
    print html.read()
    
    

      对于网页编码为gb2312等格式的网页,使用如下方法

    # -*-coding: utf-8 -*-
    
    
    
    import urllib
    
    
    
    url = "http://www.sina.com.cn/"
    
    
    
    html = urllib.urlopen(url)
    
    
    
    print html.read().decode("gbk").encode("utf-8")
    
    

      如果网页中混有多种编码(部分字节无法按 gbk 解码),可以给 decode 加上 "ignore" 参数,忽略无法解码的字节:

    # -*-coding: utf-8 -*-
    
    # Author:Evilxr
    
    
    
    import urllib
    
     
    
    url = "http://www.sina.com.cn/"
    
     
    
    html = urllib.urlopen(url)
    
     
    
    print html.read().decode("gbk", "ignore").encode("utf-8")
    
    

      

  2. 获取Web服务器头部信息
    # -*-coding: utf-8 -*-
    
    # Author:Evilxr
    
    
    
    import urllib
    
    
    
    url = "http://www.sina.com.cn/"
    
    
    
    html = urllib.urlopen(url)
    
    
    
    print html.info()
    
    

      返回信息:

    Server: nginx
    
    Date: Mon, 10 Nov 2014 12:54:50 GMT
    
    Content-Type: text/html
    
    Last-Modified: Mon, 10 Nov 2014 12:54:11 GMT
    
    Vary: Accept-Encoding
    
    Expires: Mon, 10 Nov 2014 12:55:50 GMT
    
    Cache-Control: max-age=60
    
    X-Powered-By: schi_v1.03
    
    Age: 27
    
    Content-Length: 563513
    
    X-Cache: HIT from cd31-151.sina.com.cn
    
    Connection: close
    
    
    
    
    
    [Finished in 0.2s]
    
    

      

  3. 获取网页状态码
    # -*-coding: utf-8 -*-
    
    # Author:Evilxr
    
    
    
    import urllib
    
    
    
    url = "http://www.sina.com.cn/"
    
    
    
    html = urllib.urlopen(url)
    
    
    
    # 200正常访问	301重定向	403 禁止访问 404页面不存在	500 服务器忙或者服务器无响应
    
    print html.getcode()
    
    
    
    # 获取用户传入的url
    
    print html.geturl()
    
    
    
    # 关闭文件
    
    html.close
    
    

      

  4. 保存网页内容
    # -*-coding: utf-8 -*-
    # Author:Evilxr
    # Download a page and write it straight to a local file.

    import urllib

    page_url = "http://www.cdnzz.com/"
    target_path = "d:\\evilxr.html"

    urllib.urlretrieve(page_url, target_path)
    
    

      

  5. 获取网站编码类型
    # coding:utf8
    
    # Author:Evilxr
    
    
    
    import urllib
    
    
    
    url = "http://www.163.com"
    
    
    
    html = urllib.urlopen(url)
    
    
    
    print html.info().getparam('charset')
    
    html.close()
    
    

      返回:

    GBK
    
    [Finished in 0.6s]
    
    

      

    # coding:utf8
    
    # Author:Evilxr
    
    
    
    import urllib
    
    
    
    url = "http://www.cnblogs.com/Evilxr"
    
    
    
    html = urllib.urlopen(url)
    
    
    
    print html.info().getparam('charset')
    
    html.close()
    
    

      返回:

    utf-8
    
    [Finished in 0.3s]
    
    

      

  6. 自动获取网站编码 chardet[字符集检测]
    #先安装chardet
    
    #pip install chardet
    # coding:utf8
    
    
    
    import urllib 
    
    import chardet
    
    
    
    def automatic_detect(url):
        """Return chardet's guess at the character encoding of the page at *url*.

        The whole response body is downloaded and passed to ``chardet.detect``;
        the ``'encoding'`` entry of its result dict is returned. Per the chardet
        documentation this can be ``None`` when no guess can be made.
        """
        response = urllib.urlopen(url)
        try:
            content = response.read()
        finally:
            # Close the response even if the read fails.
            response.close()
        return chardet.detect(content)['encoding']
    
    
    
    url_list = ["http://www.sina.com.cn/", 
    
    			 "http://www.cnblogs.com/evilxr",
    
    			  "http://bbs.hackav.com/",
    
    			  "http://www.baidu.com/",
    
    			  "http://fuli.ba/"]
    
    for url in url_list:
    
    	print url, automatic_detect(url)

      返回:

    http://www.sina.com.cn/ GB2312
    
    http://www.cnblogs.com/evilxr utf-8
    
    http://bbs.hackav.com/ GB2312
    
    http://www.baidu.com/ utf-8
    
    http://fuli.ba/ utf-8
    
    [Finished in 17.1s]
    
    

      

你可能感兴趣的:(python)