The simplest way to fetch a page's content
#coding=utf-8
import urllib

# fetch the HTML content of a URL
def getHtml(url):
    page = urllib.urlopen(url)
    html = page.read()
    return html

html = getHtml("http://www.baidu.com/")
print html
A simple two-step redirect
When you crawl https://www.baidu.com/ (note the extra s), what comes back is an almost empty page whose only job is to tell you to jump to http://www.baidu.com. The code below follows that second hop in a straightforward way.
#coding=utf-8
import re
import urllib

# fetch the HTML content of a URL
def getHtml(url):
    page = urllib.urlopen(url)
    html = page.read()
    return html

def getNewHtml(url):
    html = getHtml(url)
    # pull the redirect target out of the placeholder page
    keyList = re.findall(r"url=(.+?)\">", html)
    website = keyList[0]
    return getHtml(website)

html = getNewHtml("https://www.baidu.com/")
print html
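To make clear what the regex r"url=(.+?)\"&gt;" is picking up: the placeholder page carries a meta-refresh tag whose url= attribute names the real address. A minimal sketch against a made-up page (the markup Baidu actually returns may differ):

# -*- coding: utf-8 -*-
import re

# hypothetical placeholder page with a meta-refresh redirect
sample = '<html><head><meta http-equiv="refresh" content="0;url=http://www.baidu.com/"></head></html>'
print re.findall(r"url=(.+?)\">", sample)
# output: ['http://www.baidu.com/']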
Notes:
1. If the captured value is not a full URL, just splice the missing pieces together yourself. Python string formatting makes this very convenient, for example:
website = '%s%s%s' % ('http://xxx/yyyy/', keyList[0], '.jpg')
2. The variable name html is used several times above, yet the uses never interfere. When a name is first assigned inside a Python function (i.e. it appears on the left of =), it is treated as a local variable regardless of whether the same name exists in the global scope, so the function works with its own local copy, as the sketch below shows.
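A minimal sketch of that scoping rule (the variable contents are illustrative):

html = "global page"

def scrape():
    # this assignment creates a new local name; the global html is untouched
    html = "local page"
    return html

print scrape()   # local page
print html       # global page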
Scraping a page while pretending to be a browser
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
import sys
import zlib
import urllib
import urllib2
import cookielib

def main():
    reload(sys)
    sys.setdefaultencoding('utf-8')
    url = 'http://pythontab.com'
    values = {
        "form_field1": "value1",
        "form_field2": "TRUE",
    }
    post_data = urllib.urlencode(values)
    # a cookie jar so the opener carries session cookies across requests
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    # # headers as sent by Firefox on a Mac
    # headers = {"User-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:36.0) Gecko/20100101 Firefox/36.0",
    #            "Referer": "http://xxx.yyy.com/test0",
    #            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    #            "Accept-Language": "en-US,en;q=0.5",
    #            "Accept-Encoding": "gzip, deflate",
    #            "Connection": "keep-alive",
    #            # "Cookie": "QSession=",
    #            "Content-Type": "application/x-www-form-urlencoded",
    #            }
    # headers as sent by Chrome on Windows
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
               'Accept': 'text/html;q=0.9,*/*;q=0.8',
               'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
               'Accept-Encoding': 'gzip',
               'Connection': 'close',
               'Referer': None  # note: if the fetch is still blocked, try setting the target site's Host (see the sketch after this script)
               }
    req = urllib2.Request(url, post_data, headers)
    response = opener.open(req)
    content = response.read()
    # the server may honour Accept-Encoding: gzip and return compressed bytes
    gzipped = response.headers.get('Content-Encoding')
    if gzipped:
        html = zlib.decompress(content, 16 + zlib.MAX_WBITS)
    else:
        html = content
    print html.decode("utf8")

if __name__ == '__main__':
    main()
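If a site keys its response on Host or Referer, those headers can also be set one by one on the request object. A minimal sketch (the header values are placeholders, not something the target site is known to require):

import urllib2

req = urllib2.Request('http://pythontab.com')
# hypothetical values; substitute the host/referer of the site you scrape
req.add_header('Host', 'pythontab.com')
req.add_header('Referer', 'http://pythontab.com/')
print urllib2.urlopen(req).read()[:200]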
About Python regular expressions
Crawlers constantly need regular expressions to find and extract content. The snippets below walk through a few common cases.
Matching text that mixes Chinese and English
# -*- coding: utf-8 -*-
# find substrings that mix Chinese and English
import re

def findPart(regex, text, name):
    res = re.findall(regex, text)
    if res:
        print "There are %d %s parts:\n" % (len(res), name)
        for r in res:
            print "\t", r.encode("utf8")

# example 1
text = u"#who#helloworld#a中文x#"
findPart(u"#[\w\u2E80-\u9FFF]+#", text, "unicode chinese")
# output:
# There are 2 unicode chinese parts:
#     #who#
#     #a中文x#

# example 2
text2 = u"#who#helloworld12a中文x3s"
findPart(u"12[\w\u2E80-\u9FFF]+3s", text2, "unicode chinese")
# output:
# There are 1 unicode chinese parts:
#     12a中文x3s
Limiting how many Chinese characters are matched
# -*- coding: utf-8 -*-
# find runs of one or two Chinese characters
import re

a = u"q这是个中文869一y9一二三886看啊ab"
b = re.compile(u"[\u4e00-\u9fa5]{1,2}")
c = b.findall(a)
for i in c:
    print i
# output:
# 这是
# 个中
# 文
# 一
# 一二
# 三
# 看啊
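The interval quantifier can also pin an exact count. A quick sketch against the same sample string (a made-up example, assuming you want three consecutive Chinese characters at a time):

# -*- coding: utf-8 -*-
import re

a = u"q这是个中文869一y9一二三886看啊ab"
# grab three consecutive Chinese characters at a time
for m in re.findall(u"[\u4e00-\u9fa5]{3}", a):
    print m
# output:
# 这是个
# 一二三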
Extracting letters and digits
# extract the letters, digits and dots between 'a' and 'b'
import re

s = "a12yy...3b"
print re.findall(r"a(.+?)b", s)
# output: ['12yy...3']
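The ? after .+ makes the quantifier lazy; once the string contains more than one possible end point, lazy and greedy matching return different spans. A small illustration (the sample string is made up):

import re

s = "a12b34b"
print re.findall(r"a(.+?)b", s)   # lazy: stops at the first 'b'  -> ['12']
print re.findall(r"a(.+)b", s)    # greedy: runs to the last 'b'  -> ['12b34']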
When writing a regular expression, take care to escape special characters, or the pattern may quietly fail to match.
# extract the quoted value after a literal prefix
import re

context = "launchable-activity: name='com.bmi.Bmi'"
patt = re.compile(r"launchable-activity:\s+name='(.*?)'")
ch = patt.findall(context)
print ch
# output: ['com.bmi.Bmi']
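When the literal part of a pattern comes from a variable, re.escape does the escaping for you instead of by hand. A minimal sketch (the strings are illustrative):

import re

# re.escape turns every regex metacharacter into a literal
needle = re.escape("a.b(c)")               # -> a\.b\(c\)
print re.findall(needle + r"\d+", "xx a.b(c)42 yy")
# output: ['a.b(c)42']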