第四章--python爬虫常用模块
#! python2.7
#-*- coding:utf-8 -*-
import urllib2
def linkBaidu():
    """Fetch the Baidu home page, save its body and print response metadata.

    The body is written to ``baiduResponse.txt`` in the current directory;
    the final URL, the HTTP status code and the response headers are then
    printed to stdout.

    Raises:
        SystemExit: with status 1 when the URL cannot be opened.
    """
    url = 'http://www.baidu.com'
    try:
        response = urllib2.urlopen(url, timeout=4)
    except urllib2.URLError:
        print("网络地址错误")
        # Raise SystemExit directly: the bare ``exit()`` helper is injected by
        # the ``site`` module and is not guaranteed to exist; a non-zero
        # status also lets callers see that the fetch failed.
        raise SystemExit(1)
    try:
        # Binary mode keeps the downloaded bytes exactly as received (text
        # mode would rewrite line endings on Windows).
        with open('baiduResponse.txt', 'wb') as fp:
            fp.write(response.read())
        print(response.geturl())   # final URL
        print(response.getcode())  # HTTP status code
        print(response.info())     # response headers
    finally:
        # Always release the underlying socket, even if writing fails.
        response.close()
# Script entry point: run the demo only when executed directly.
if '__main__' == __name__:
    linkBaidu()
#-*-coding:utf-8 -*-
'''
测试代理proxy是否有效
'''
import urllib2,re
class TestProxy(object):
    """Check whether an HTTP(S) proxy is usable.

    Constructing an instance validates the proxy string and then fetches
    ``self.url`` through the proxy, printing whether ``self.keyword`` was
    found in the returned page.
    """

    # scheme://a.b.c.d:port. The dots are escaped so that only literal dots
    # separate the octets, and \Z (rather than $) rejects a trailing newline.
    _PROXY_PATTERN = re.compile(
        r'^(https?)://([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})'
        r':([0-9]{1,5})\Z')
    # Inclusive bounds for the four octets; the first octet may not be 0 and
    # the last may be neither 0 nor 255.
    _OCTET_BOUNDS = ((1, 255), (0, 255), (0, 255), (1, 254))
    # Number of times the test URL is tried before giving up.
    _MAX_ATTEMPTS = 10

    def __init__(self, proxy):
        """Validate *proxy* and immediately test it.

        Args:
            proxy: proxy address of the form ``http://a.b.c.d:port`` or
                ``https://a.b.c.d:port``.

        Raises:
            SystemExit: when *proxy* is malformed (see checkProxyFormat).
        """
        self.proxy = proxy
        self.checkProxyFormat(self.proxy)
        self.url = 'http://www.baidu.com'
        self.timeout = 4
        self.keyword = '百度'  # text searched for in the returned page
        self.useProxy(proxy)

    @staticmethod
    def _inRange(text, low, high):
        """Return True if *text* is a canonical decimal in ``low..high``.

        "Canonical" means no leading zeros, so ``'01'`` is rejected.
        *text* must consist of decimal digits only.
        """
        number = int(text)
        return str(number) == text and low <= number <= high

    def checkProxyFormat(self, proxy):
        """Validate the proxy string, exiting the program if it is invalid.

        Prints a confirmation message and returns None when *proxy* is
        well formed.

        Args:
            proxy: proxy address to validate.

        Raises:
            SystemExit: with status 1 when the scheme, an octet or the port
                is malformed or out of range.
        """
        match = self._PROXY_PATTERN.match(proxy)
        valid = match is not None
        if valid:
            octets = match.group(2, 3, 4, 5)
            port = match.group(6)
            valid = all(
                self._inRange(octet, low, high)
                for octet, (low, high) in zip(octets, self._OCTET_BOUNDS)
            ) and self._inRange(port, 1, 65535)  # 65535 is a valid port
        if not valid:
            print("你输入的代理地址格式不正确")
            raise SystemExit(1)
        print('输入的http代理服务器符合标准')

    def useProxy(self, proxy):
        """Fetch ``self.url`` through *proxy* and report whether it works.

        The request is tried up to ``_MAX_ATTEMPTS`` times. The proxy is
        reported usable when ``self.keyword`` is found in the response body
        and unusable otherwise, including when every attempt fails.

        Args:
            proxy: already validated proxy address, ``scheme://ip:port``.
        """
        scheme = proxy.split('//')[0].replace(':', '')
        address = proxy.split('//')[1]
        # Printed as a tuple, matching what the Python 2 print statement
        # produces for ``print(a, b)``.
        print((scheme, address))
        # build_opener() returns an object whose open() method works like
        # urlopen(); install_opener() makes it the global default, so every
        # later urllib2.urlopen() call goes through this proxy.
        opener = urllib2.build_opener(
            urllib2.ProxyHandler({scheme: address}))  # e.g. {'http': '163.125.68.237:8888'}
        urllib2.install_opener(opener)
        response = None
        for _ in range(self._MAX_ATTEMPTS):
            try:
                response = urllib2.urlopen(self.url, timeout=self.timeout)
                break
            except Exception as e:
                print(e)
        if response is None:
            # Every attempt failed: report it instead of touching an
            # unbound response object.
            print('该代理不可用')
            return
        try:
            content = response.read()
        finally:
            response.close()
        if re.search(self.keyword, content):
            print("已提取特征词,该代理可用")
        else:
            print('该代理不可用')
# Script entry point: constructing TestProxy validates and tests the proxy.
if '__main__' == __name__:
    TestProxy(r'http://163.125.68.237:8888')
#-*-coding:utf-8 -*-
import userAgents
import urllib2
class ModifyHeader(object):
    """Request a page with different User-Agent headers and save each reply.

    Constructing an instance looks up one desktop and one mobile entry in
    the ``userAgents`` module and writes the responses to ``1.html`` and
    ``2.html`` respectively.
    """

    def __init__(self):
        """Fetch ``self.url`` once per user agent."""
        # NOTE(review): entries are presumably 'Header-Name:value' strings;
        # confirm against the userAgents module.
        piua = userAgents.pcUserAgent.get('IE 9.0')
        muua = userAgents.mobileUserAgent.get('UC standard')
        print('piua: '+piua)
        self.url = 'http://fanyi.youdao.com'
        self.userAgent(piua, 1)
        self.userAgent(muua, 2)

    def userAgent(self, agent, name):
        """Fetch ``self.url`` with the given header and save the response.

        The output file ``<name>.html`` starts with the *agent* string and a
        blank line, followed by the raw response body.

        Args:
            agent: header in ``'Header-Name:value'`` form.
            name: value used (via ``str``) as the output file's base name.

        Raises:
            ValueError: when *agent* contains no ``':'`` separator.
        """
        # Split on the first colon only: header values such as
        # 'Mozilla/5.0 (...; rv:11.0)' contain colons of their own and must
        # not be truncated.
        header_name, separator, header_value = agent.partition(':')
        if not separator:
            raise ValueError(
                'user agent entry must look like "Header-Name:value": %r'
                % (agent,))
        request = urllib2.Request(self.url)
        request.add_header(header_name.strip(), header_value.strip())
        response = urllib2.urlopen(request)
        try:
            page = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()
        filename = str(name) + '.html'
        # Binary mode keeps the downloaded bytes exactly as received.
        with open(filename, 'wb') as fp:
            fp.write('%s\n\n' % agent)
            fp.write(page)
# Script entry point: constructing ModifyHeader performs both requests.
if '__main__' == __name__:
    ModifyHeader()
getpass.getuser()
返回当前用户名。这个函数会按顺序检查环境变量LOGNAME, USER, LNAME和USERNAME。返回第一个非空的值。如果检查不到非空的值,模块会尝试导入pwd模块,如果系统支持pwd模块,会返回通过pwd模块获取的用户名,否则报错。