Python访问网页
2011-09-15 15:21:21| 分类:派森程序点滴|举报|字号订阅
使用Python访问网页主要有三种方式: urllib, urllib2, httplib
urllib比较简单,功能相对也比较弱,httplib简单强大,但好像不支持session
1. 最简单的页面访问
import urllib2
try:
    res = urllib2.urlopen(url)
    print res.read()
except urllib2.URLError, e:
    print e
2. 加上要get或post的数据
data={"name":"hank", "passwd":"hjz"}
urllib2.urlopen(url, urllib.urlencode(data))
3. 加上http头
header={"User-Agent": "Mozilla-Firefox5.0"}
urllib2.urlopen(url, urllib.urlencode(data), header)
使用opener和handler:
opener = urllib2.build_opener(handler)
urllib2.install_opener(opener)
4. 加上session
cj = cookielib.CookieJar()
cjhandler=urllib2.HTTPCookieProcessor(cj)
opener = urllib2.build_opener(cjhandler)
urllib2.install_opener(opener)
5. 加上Basic认证
password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
top_level_url = "http://www.163.com/"
password_mgr.add_password(None, top_level_url, username, password)
handler = urllib2.HTTPBasicAuthHandler(password_mgr)
opener = urllib2.build_opener(handler)
urllib2.install_opener(opener)
6. 使用代理
proxy_support = urllib2.ProxyHandler({"http":"http://1.2.3.4:3128/"})
opener = urllib2.build_opener(proxy_support)
urllib2.install_opener(opener)
7. 设置超时
socket.setdefaulttimeout(5)
参考: http://svn.python.org/projects/python/trunk/Doc/howto/urllib2.rst
一个简单的Python写的XML爬虫
http://www.veryhuo.com/a/view/11163.html
#-*- encoding: utf-8 -*-
import codecs
import sys
import threading
from urllib import urlencode
from urllib2 import urlopen
from xml.dom.minidom import parseString
class Serach:
    """Fetch search-suggestion keywords from a remote XML service, filter
    them, and write the result to a text file.

    NOTE(review): the indentation of this class was lost in the source it
    was pasted from; the structure below is a reconstruction -- confirm
    against the original program.
    """

    def __init__(self, key=None):
        # The search keyword that drives all lookups.
        self.key = key

    def SendPy(self, key):
        """Download the suggestion-box XML and return its <SuggestWord> nodes.

        Returns an empty list when the download or the XML parse fails.
        NOTE(review): the `key` argument is ignored -- the method always
        uses self.key, exactly as the original code did.
        """
        try:
            contentpy = urlopen("http://xxxx.com/ac_box?ac=" + self.key).read()
        except Exception:
            print ("down load py!")
            return []  # bug fix: original fell through and hit a NameError
        try:
            xmldoc = parseString(contentpy)
        except Exception:
            print ("ill formed xml file")
            return []  # bug fix: original fell through and hit a NameError
        # Walk the parsed document and collect the suggestion nodes.
        root = xmldoc.documentElement
        keyList = root.getElementsByTagName('SuggestWord')
        return keyList

    def SendKey(self, keyword):
        """Query the info service for `keyword` and return its ErrorCode.

        Returns the 'ErrorCode' attribute of the first <Query> element as a
        string, or 104 on any failure (104 is the error code the original
        code produced for parse/attribute failures).
        """
        keyword = keyword.encode('gbk')
        tupleList = []
        try:
            # Fetch the XML and transcode cp936 -> utf-8 (Python 2 idiom;
            # `unicode` does not exist on Python 3).
            content = urlopen("http://xxxx.com/btinfo?keyword=" + keyword + "&num=1").read()
            content = unicode(content, "cp936").encode("utf-8")
        except Exception:
            print ("down load key!")
            return 104  # bug fix: original crashed with NameError below
        # NOTE(review): replacing '' with '' is a no-op; the real pattern was
        # probably lost when the code was pasted -- confirm.
        content = content.replace('''''', '''''')
        try:
            xmldoc = parseString(content)
        except Exception:
            print ("ill formed xml file")
            return 104  # same outcome the original reached (NameError -> 104)
        try:
            query = xmldoc.getElementsByTagName('Query')[0]
            tupleList = query.getAttribute('ErrorCode')
        except Exception:
            tupleList = 104
        return tupleList

    def run(self):
        """Collect suggestions, filter them, and save them to a text file.

        NOTE(review): self.conn, self.savePath, self.MySQLKey and
        self.MySQLPy are never defined in this class -- they must be
        supplied elsewhere; verify before running.
        """
        ls = self.SendPy(self.key)
        count = len(self.key)
        cur = self.conn.cursor()
        parts = []  # replaces the original `str` accumulator (shadowed a builtin)
        for doc in ls:
            word = doc.firstChild.data
            text = self.SendKey(word)
            if text == '0':
                test = self.MySQLKey(word)
                if test != '2':
                    parts.append(word + '|' + test)
        if count > 3:
            sitetag = self.MySQLPy(self.key)
            if sitetag != ():
                for x in sitetag:
                    tsql = "xxxx"
                    cur.execute(tsql)
                    for s in cur.fetchall():
                        # rmvb/rm files are flagged '0', everything else '1'.
                        if s[0] == 'rmvb' or s[0] == 'rm':
                            r = '0'
                        else:
                            r = '1'
                        parts.append(x[0] + '|' + r)
        # De-duplicate the collected entries, then join with commas.
        # Building a list and joining avoids the original's fragile
        # trailing-comma trimming with str[:-1].
        strtag = list(set(parts))
        sText = ','.join(strtag)
        file_object = codecs.open(self.savePath + self.key + '.txt', 'w', 'utf-8')
        try:
            file_object.write(sText)
        finally:
            file_object.close()  # bug fix: release the handle even on a write error
if __name__ == "__main__":
    # Run a search for the keyword given on the command line, if any.
    args = sys.argv[1:]
    if args:
        Serach(args[0]).run()