A Simple XML Crawler in Python; Accessing Web Pages with Python

Accessing Web Pages with Python

2011-09-15 15:21:21 | Category: Python Programming Notes

There are three main ways to access web pages in Python: urllib, urllib2, and httplib.

urllib is fairly simple but relatively weak in functionality; httplib is simple yet powerful, but it does not appear to support sessions.
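For comparison, here is a minimal httplib sketch; www.example.com stands in for a real host:

import httplib

conn = httplib.HTTPConnection("www.example.com")  # placeholder host
conn.request("GET", "/")
resp = conn.getresponse()
print(resp.status)  # e.g. 200
print(resp.read())  # response body
conn.close()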

1. The simplest page request

import urllib2

url = "http://www.example.com/"  # placeholder URL

try:
    res = urllib2.urlopen(url)
    print(res.read())
except urllib2.URLError, e:
    print(e.reason)
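Beyond the body, the response object also exposes the final URL and the headers; a small sketch with the same placeholder URL:

import urllib2

res = urllib2.urlopen("http://www.example.com/")
print(res.geturl())   # final URL, after any redirects
print(res.info())     # the response headers
print(res.read(200))  # the first 200 bytes of the body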

2. Adding GET or POST data

import urllib

data = {"name": "hank", "passwd": "hjz"}
urllib2.urlopen(url, urllib.urlencode(data))  # a non-None data argument makes this a POST
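To make the GET/POST distinction explicit, here is a hedged sketch of both forms; the /login endpoint is hypothetical:

import urllib
import urllib2

data = urllib.urlencode({"name": "hank", "passwd": "hjz"})

# GET: the encoded data travels in the query string
res_get = urllib2.urlopen("http://www.example.com/login?" + data)

# POST: passing data as the second argument switches urlopen to POST
res_post = urllib2.urlopen("http://www.example.com/login", data)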

3. Adding HTTP headers

header = {"User-Agent": "Mozilla-Firefox5.0"}

# urlopen() itself has no headers parameter; headers must be attached to a Request object
req = urllib2.Request(url, urllib.urlencode(data), header)
urllib2.urlopen(req)

Using openers and handlers:

opener = urllib2.build_opener(handler)
urllib2.install_opener(opener)
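To see what build_opener/install_opener buy you, here is a small sketch using urllib2's debug-printing HTTPHandler (a real urllib2 handler; the URL is a placeholder):

import urllib2

handler = urllib2.HTTPHandler(debuglevel=1)  # prints the HTTP exchange to stdout
opener = urllib2.build_opener(handler)
urllib2.install_opener(opener)               # every later urlopen() goes through this opener

res = urllib2.urlopen("http://www.example.com/")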

4. Adding a session

import cookielib

cj = cookielib.CookieJar()
cjhandler = urllib2.HTTPCookieProcessor(cj)
opener = urllib2.build_opener(cjhandler)
urllib2.install_opener(opener)
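Once the cookie-aware opener is installed, a login followed by a second request shares the session cookie automatically; a sketch with hypothetical /login and /profile endpoints:

import cookielib
import urllib
import urllib2

cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
urllib2.install_opener(opener)

# the login response sets a session cookie, which lands in cj ...
login_data = urllib.urlencode({"name": "hank", "passwd": "hjz"})
urllib2.urlopen("http://www.example.com/login", login_data)

# ... and is sent back automatically on the next request
res = urllib2.urlopen("http://www.example.com/profile")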

5. Adding Basic authentication

password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
top_level_url = "http://www.163.com/"
# username and password must be defined beforehand
password_mgr.add_password(None, top_level_url, username, password)
handler = urllib2.HTTPBasicAuthHandler(password_mgr)
opener = urllib2.build_opener(handler)
urllib2.install_opener(opener)
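With the opener installed, nothing else is needed at the call site; the handler answers the server's 401 challenge automatically. A one-line usage (the path is hypothetical):

res = urllib2.urlopen("http://www.163.com/protected/")  # credentials supplied on the 401 challenge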

6. Using a proxy

proxy_support = urllib2.ProxyHandler({"http": "http://1.2.3.4:3128/"})
opener = urllib2.build_opener(proxy_support)
urllib2.install_opener(opener)
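Handlers compose, so the proxy can be combined with, say, the cookie handler from section 4 in a single opener; a sketch reusing the placeholder proxy address:

import cookielib
import urllib2

proxy_support = urllib2.ProxyHandler({"http": "http://1.2.3.4:3128/"})
cookie_support = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
opener = urllib2.build_opener(proxy_support, cookie_support)
urllib2.install_opener(opener)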

7. Setting a timeout

import socket

socket.setdefaulttimeout(5)

Reference: http://svn.python.org/projects/python/trunk/Doc/howto/urllib2.rst
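Note that socket.setdefaulttimeout() is process-wide; since Python 2.6, urlopen() also accepts a per-request timeout argument. A sketch with a placeholder URL:

import urllib2

try:
    res = urllib2.urlopen("http://www.example.com/", timeout=5)  # Python 2.6+
except urllib2.URLError, e:
    print(e.reason)  # a connect timeout surfaces as a URLError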

A Simple XML Crawler in Python

http://www.veryhuo.com/a/view/11163.html

# -*- encoding: utf-8 -*-
import codecs
import sys
import threading
from urllib import urlencode
from urllib2 import urlopen
from xml.dom.minidom import parseString

class Serach:
    def __init__(self, key=None):
        self.key = key

    def SendPy(self, key):
        try:
            contentpy = urlopen("http://xxxx.com/ac_box?ac=" + self.key).read()
        except:
            print("failed to download the suggestion XML!")
            return []
        try:
            xmldoc = parseString(contentpy)
        except:
            print("ill-formed xml file")
            return []
        root = xmldoc.documentElement
        # walk the XML structure and collect the suggestion nodes
        keyList = root.getElementsByTagName('SuggestWord')
        return keyList

    def SendKey(self, keyword):
        keyword = keyword.encode('gbk')
        tupleList = []
        try:
            # fetch the XML for this keyword and convert the encoding
            content = urlopen("http://xxxx.com/btinfo?keyword=" + keyword + "&num=1").read()
            content = unicode(content, "cp936").encode("utf-8")
        except:
            print("failed to download the keyword XML!")
            return tupleList
        # clean up the raw XML (the original search/replace strings were lost from the source)
        content = content.replace('', '')
        try:
            xmldoc = parseString(content)
        except:
            print("ill-formed xml file")
            return tupleList
        try:
            query = xmldoc.getElementsByTagName('Query')[0]
            tupleList = query.getAttribute('ErrorCode')
        except:
            tupleList = 104
        return tupleList

    def run(self):
        # self.conn, self.savePath, self.MySQLKey and self.MySQLPy are referenced
        # below but not defined in this excerpt; the full program sets them up elsewhere
        ls = self.SendPy(self.key)
        count = len(self.key)
        cur = self.conn.cursor()
        str = ''
        for doc in ls:
            tuple = doc.firstChild.data
            text = self.SendKey(tuple)
            if text == '0':
                test = self.MySQLKey(tuple)
                if test != '2':
                    str = str + tuple + '|' + test + ','
        if count > 3:
            sitetag = self.MySQLPy(self.key)
            if sitetag != ():
                for x in sitetag:
                    tsql = "xxxx"
                    cur.execute(tsql)
                    #print(cur.fetchall())
                    for s in cur.fetchall():
                        if (s[0] == 'rmvb') or (s[0] == 'rm'):
                            r = '0'
                        else:
                            r = '1'
                        str = str + x[0] + '|' + r + ','
                str = str[:-1]
            else:
                str = str[:-1]
        # split into a list, deduplicate via a set, then rejoin
        strtag = list(set(str.split(',')))
        sText = ','.join(strtag)
        file_object = codecs.open(self.savePath + self.key + '.txt', 'w', 'utf-8')
        file_object.write(sText)
        file_object.close()

if __name__ == "__main__":
    if len(sys.argv) > 1:
        s = Serach(sys.argv[1])
        s.run()
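Usage note: the script takes the search key as its first command-line argument and writes the deduplicated results to <savePath>/<key>.txt. The filename serach.py below is hypothetical:

python serach.py hello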
