import urllib
import urllib2
url='http://www.zhihu.com/#signin'
user_agent='MOZILLA/5.0'
values={'username':'[email protected]','password':'xxx'}
headers={'User-Agent':user_agent}
data=urllib.urlencode(values)  # urlencode is provided by urllib, not urllib2
request=urllib2.Request(url,data,headers)
response=urllib2.urlopen(request)
page=response.read()
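The block above is Python 2 (urllib2). A minimal Python 3 sketch of the same POST, assuming the merged urllib.request / urllib.parse modules:
import urllib.parse
import urllib.request

url = 'http://www.zhihu.com/#signin'
values = {'username': '[email protected]', 'password': 'xxx'}
headers = {'User-Agent': 'MOZILLA/5.0'}
data = urllib.parse.urlencode(values).encode('utf-8')  # Python 3 requires bytes here
request = urllib.request.Request(url, data, headers)
response = urllib.request.urlopen(request)
page = response.read()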
import requests
url = "http://www.baidu.com"
r = requests.get(url)
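For comparison, roughly the same POST with requests (a sketch; the URL, field names and User-Agent are just carried over from the example above, not verified against the site):
import requests

url = 'http://www.zhihu.com/#signin'
values = {'username': '[email protected]', 'password': 'xxx'}
headers = {'User-Agent': 'MOZILLA/5.0'}
r = requests.post(url, data=values, headers=headers)
page = r.text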
import urllib2
request = urllib2.Request(url)
response = urllib2.urlopen(request)
page = response.read()
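Either response object can be inspected before parsing; a small sketch using the urllib2 response and the requests response from the examples above:
print(response.getcode())          # HTTP status code of the urllib2 response, e.g. 200
print(response.info())             # response headers (urllib2)
print(r.status_code)               # HTTP status code of the requests response
print(r.headers['Content-Type'])   # response headers (requests)
print(r.encoding)                  # encoding requests guessed for r.text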
BeautifulSoup
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')  # html is the page source fetched above
print(soup.prettify())
print(soup.title)             # the title node
print(soup.title.name)        # the name of the title node
print(soup.title.string)      # the text of the title node
print(soup.title.parent.name) # the name of the title node's parent
print(soup.p)                 # the first p node
print(soup.p["class"])        # the class attribute of that p node
print(soup.a)                 # the first a node
print(soup.find_all('a'))     # all a nodes
print(soup.find(id='link3'))  # the node whose id is "link3"
for i, child in enumerate(soup.p.children):
    print(i, child)                               # direct children of the p node
print(soup.find_all(attrs={'id': 'list-1'}))      # nodes with id="list-1"
print(soup.find_all(attrs={'name': 'elements'}))  # nodes with name="elements"
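A self-contained sketch that the node-access calls above can be run against (the HTML snippet is made up for illustration):
from bs4 import BeautifulSoup

html = '''
<html><head><title>Demo page</title></head>
<body>
<p class="story">Links:
  <a href="link1.html" id="link1">one</a>
  <a href="link2.html" id="link2">two</a>
  <a href="link3.html" id="link3">three</a>
</p>
<ul class="list" id="list-1" name="elements">
  <li>item 1</li>
  <li>item 2</li>
</ul>
</body></html>
'''

soup = BeautifulSoup(html, 'lxml')
print(soup.title.string)                      # Demo page
print(soup.p['class'])                        # ['story']
print(soup.find_all('a'))                     # all three a nodes
print(soup.find(id='link3'))                  # the a node with id="link3"
print(soup.find_all(attrs={'id': 'list-1'}))  # the ul node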
XPath
from lxml import etree

html = etree.parse('test.html')                        # parse a local HTML file into a tree
html_data = etree.tostring(html, pretty_print=True)
res = html_data.decode('utf-8')                        # pretty-printed source as a string
html_data = html.xpath('/html/body/div/ul/li/a')       # a nodes under the absolute path
for i in html_data:
    print(i.text)
html_data = html.xpath('/html/body/div/ul/li/a/@href')
for i in html_data:
    print(i)                                            # href attribute of the a tags under that path
html_data = html.xpath('/html/body/div/ul/li/a[@href="link2.html"]/text()')
print(html_data)                                        # text of the a tag whose href is "link2.html" (absolute path)
for i in html_data:
    print(i)
html_data = html.xpath('//li/a/text()')
print(html_data)                                        # relative path: text of the a tags under every li tag
for i in html_data:
    print(i)
html_data = html.xpath('//li/a//@href')
print(html_data)                                        # relative path: href attribute of the a tags under every li tag
for i in html_data:
    print(i)
html = etree.HTML(wb_data)                              # wb_data is the raw HTML string of the page
html_data = html.xpath('//li/a[@href="link2.html"]')    # a nodes whose href is "link2.html"
print(html_data)
for i in html_data:
    print(i.text)
html = etree.HTML(wb_data)
html_data = html.xpath('//li[last()]/a/text()')         # text of the a tag inside the last li
print(html_data)
for i in html_data:
    print(i)
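A self-contained sketch of the same XPath queries against an inline document (the HTML is made up; wb_data above is assumed to be such a string):
from lxml import etree

wb_data = '''
<div>
  <ul>
    <li><a href="link1.html">first item</a></li>
    <li><a href="link2.html">second item</a></li>
    <li><a href="link3.html">third item</a></li>
  </ul>
</div>
'''

html = etree.HTML(wb_data)
print(html.xpath('//li/a/text()'))                       # text of every a under a li
print(html.xpath('//li/a/@href'))                        # href of every a under a li
print(html.xpath('//li/a[@href="link2.html"]/text()'))   # ['second item']
print(html.xpath('//li[last()]/a/text()'))               # ['third item']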
Fetching a page and saving it locally
import urllib
import urllib2
import os

def getPage(url):
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    return response.read()

url = 'http://www.baidu.com'
result = getPage(url)
file_name = 'test.txt'
file_path = 'D:/test'
if not os.path.exists(file_path):
    os.makedirs(file_path)
local = os.path.join(file_path, file_name)
f = open(local, "w+")
f.write(result)
f.close()
url = 'http://www.baidu.com'   # change this to the address of an image/file/video/page
file_name = 'test.txt'
file_path = 'D:/test'
if not os.path.exists(file_path):
    os.makedirs(file_path)
local = os.path.join(file_path, file_name)
urllib.urlretrieve(url, local)   # urlretrieve downloads the url straight to the local path
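A Python 3 sketch of the same download-and-save flow using requests (the URL, D:/test directory and file name are just carried over from the example above):
import os
import requests

url = 'http://www.baidu.com'   # change this to the address of an image/file/video/page
file_path = 'D:/test'
file_name = 'test.txt'

if not os.path.exists(file_path):
    os.makedirs(file_path)

local = os.path.join(file_path, file_name)
r = requests.get(url)
with open(local, 'wb') as f:   # write bytes so binary files survive unchanged
    f.write(r.content)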