Some web crawler code snippets

# Python 2: log in by POSTing form data with urllib2
import urllib
import urllib2

url = 'http://www.zhihu.com/#signin'
user_agent = 'MOZILLA/5.0'
values = {'username': '[email protected]', 'password': 'xxx'}
headers = {'User-Agent': user_agent}
data = urllib.urlencode(values)  # urlencode lives in urllib, not urllib2
request = urllib2.Request(url, data, headers)  # passing data makes this a POST
response = urllib2.urlopen(request)
page = response.read()
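The snippet above targets Python 2. For reference, a minimal Python 3 sketch of the same POST (urllib and urllib2 were merged into urllib.request / urllib.parse, and the request body must be bytes):

from urllib import request, parse

url = 'http://www.zhihu.com/#signin'
values = {'username': '[email protected]', 'password': 'xxx'}
headers = {'User-Agent': 'MOZILLA/5.0'}

data = parse.urlencode(values).encode('utf-8')  # Python 3 requires bytes here
req = request.Request(url, data=data, headers=headers)
with request.urlopen(req) as response:
    page = response.read()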

# The requests library: the same fetch in one call
import requests

url = "http://www.baidu.com"
r = requests.get(url)
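A few commonly used attributes of the Response object returned above:

print(r.status_code)  # HTTP status code, e.g. 200
print(r.encoding)     # encoding guessed from the response headers
print(r.text)         # body decoded to text
print(r.content)      # raw bytes, useful for images and other binary data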

# Python 2: the equivalent GET with urllib2
import urllib2

request = urllib2.Request(url)
response = urllib2.urlopen(request)
page = response.read()
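Under Python 3 the same GET collapses to a one-liner:

from urllib import request

page = request.urlopen('http://www.baidu.com').read()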

 

BeautifulSoup

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')
print(soup.prettify())
print(soup.title)              # the title node
print(soup.title.name)         # the title node's tag name
print(soup.title.string)       # the title node's text
print(soup.title.parent.name)  # the tag name of title's parent node
print(soup.p)                  # the first p node
print(soup.p["class"])         # the class attribute of that p node
print(soup.a)                  # the first a node
print(soup.find_all('a'))      # all a nodes
print(soup.find(id='link3'))   # the node whose id is "link3"
for i, child in enumerate(soup.p.children):
    print(i, child)            # iterate over the direct children of the first p node
print(soup.find_all(attrs={'id': 'list-1'}))      # nodes with id="list-1"
print(soup.find_all(attrs={'name': 'elements'}))  # nodes with name="elements"
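The snippets above assume an html string is already in scope. A minimal self-contained sketch, with made-up HTML for illustration:

from bs4 import BeautifulSoup

html = """
<html><head><title>Demo page</title></head>
<body>
<p class="story">Links:
<a href="link1.html" id="link1">one</a>
<a href="link2.html" id="link2">two</a>
<a href="link3.html" id="link3">three</a>
</p>
</body></html>
"""

soup = BeautifulSoup(html, 'lxml')
print(soup.title.string)      # Demo page
print(soup.find(id='link3'))  # <a href="link3.html" id="link3">three</a>
print([a['href'] for a in soup.find_all('a')])  # ['link1.html', 'link2.html', 'link3.html']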

XPath (lxml)

from lxml import etree

html = etree.parse('test.html', etree.HTMLParser())  # parse the file with lxml's HTML parser
html_data = etree.tostring(html, pretty_print=True)
res = html_data.decode('utf-8')  # the parsed document serialized back to a string

html_data = html.xpath('/html/body/div/ul/li/a')  # absolute path: every a node at this path
for i in html_data:
    print(i.text)

html_data = html.xpath('/html/body/div/ul/li/a/@href')
for i in html_data:
    print(i)  # the href attribute of each a node at this path

html_data = html.xpath('/html/body/div/ul/li/a[@href="link2.html"]/text()')
print(html_data)  # text of the a node at this absolute path whose href equals "link2.html"
for i in html_data:
    print(i)

html_data = html.xpath('//li/a/text()')
print(html_data)  # relative path: the text of the a nodes under every li, wherever the li appears
for i in html_data:
    print(i)

html_data = html.xpath('//li/a/@href')
print(html_data)  # the href attribute of the a nodes under every li
for i in html_data:
    print(i)

html = etree.HTML(wb_data)  # parse an HTML string (e.g. a fetched page) instead of a file
html_data = html.xpath('//li/a[@href="link2.html"]')
print(html_data)
for i in html_data:
    print(i.text)

html = etree.HTML(wb_data)
html_data = html.xpath('//li[last()]/a/text()')  # text of the a node inside the last li
print(html_data)
for i in html_data:
    print(i)
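As with the BeautifulSoup examples, the above assumes test.html or wb_data already exists. A self-contained sketch with made-up HTML:

from lxml import etree

wb_data = """
<div><ul>
<li><a href="link1.html">first item</a></li>
<li><a href="link2.html">second item</a></li>
<li><a href="link3.html">third item</a></li>
</ul></div>
"""

html = etree.HTML(wb_data)
print(html.xpath('//li/a/text()'))                      # ['first item', 'second item', 'third item']
print(html.xpath('//li/a/@href'))                       # ['link1.html', 'link2.html', 'link3.html']
print(html.xpath('//li/a[@href="link2.html"]/text()'))  # ['second item']
print(html.xpath('//li[last()]/a/text()'))              # ['third item']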

Fetching a file and saving it locally

# Python 2: fetch a page and write it to a local file
import urllib
import urllib2
import os

def getPage(url):
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    return response.read()

url = 'http://www.baidu.com'
result = getPage(url)
file_name = 'test.txt'
file_path = 'D:/test'
if not os.path.exists(file_path):
    os.makedirs(file_path)
local = os.path.join(file_path, file_name)
f = open(local, "wb")  # binary mode so images and other non-text files are not mangled
f.write(result)
f.close()
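For comparison, a minimal Python 3 sketch of the same download using requests (the URL and the D:/test path are just the placeholders from above):

import os
import requests

url = 'http://www.baidu.com'
file_path = 'D:/test'
os.makedirs(file_path, exist_ok=True)  # no explicit exists() check needed

local = os.path.join(file_path, 'test.txt')
r = requests.get(url)
with open(local, 'wb') as f:
    f.write(r.content)  # r.content is bytes, safe for any file type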




# Variant using urlretrieve, which downloads and saves in one call (Python 2)
url = 'http://www.baidu.com'  # change this to the address of the image/file/video/page
file_name = 'test.txt'
file_path = 'D:/test'
if not os.path.exists(file_path):
    os.makedirs(file_path)
local = os.path.join(file_path, file_name)
urllib.urlretrieve(url, local)  # fetch url and save it straight to the local path
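Under Python 3, urlretrieve moved into urllib.request; with the same url and local variables as above:

from urllib.request import urlretrieve

urlretrieve(url, local)  # same behavior: download url and save it to local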

 
