lxml: a library for parsing XML. HTML documents have essentially the same tree structure as XML, so an HTML document can be parsed either with regular expressions or with lxml.
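A minimal sketch of the basic workflow (assuming lxml is installed, e.g. via pip install lxml):

from lxml import etree

# Parse an HTML fragment into an element tree, then query it with XPath.
dom = etree.HTML('<ul><li>hello</li></ul>')
print(dom.xpath('//li/text()'))   # ['hello']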
Sample document:
<bookstore>
    <li id="test3">li test3</li>
    <book>
        <title>Harry Potter</title>
        <author>J K. Rowling</author>
        <year>2005</year>
        <price>29.99</price>
        <li id="test1">li test1</li>
        <li>li test2</li>
    </book>
    <li id="test4">li test4</li>
</bookstore>
lxml examples
Example 1: find Harry Potter
/bookstore/book/title
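A quick sketch of this query. Note that etree.HTML wraps its input in html/body tags, so an absolute path like /bookstore/... only matches when the document is parsed as XML (etree.XML):

from lxml import etree

xml = '<bookstore><book><title>Harry Potter</title></book></bookstore>'
dom = etree.XML(xml)
print(dom.xpath('/bookstore/book/title/text()'))   # ['Harry Potter']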
Example: find all li inside book
/bookstore/book/li
Example: find all li inside bookstore
/bookstore/book/li|/bookstore/li   (| means "or": li under book, or li directly under bookstore)
/bookstore//li   (// matches li at any depth below bookstore, regardless of the hierarchy)
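A sketch of both forms against a condensed version of the sample document:

from lxml import etree

xml = ('<bookstore><li id="test3">li test3</li>'
       '<book><li id="test1">li test1</li><li>li test2</li></book>'
       '<li id="test4">li test4</li></bookstore>')
dom = etree.XML(xml)

# Union of two absolute paths: li under book plus li directly under bookstore.
print([li.text for li in dom.xpath('/bookstore/book/li|/bookstore/li')])

# Descendant axis: every li anywhere below bookstore, regardless of depth.
print([li.text for li in dom.xpath('/bookstore//li')])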
实例: 找到整个文档中的li
//li
Example: find all li that have an id attribute
//li[@id]
Example: find all li whose id attribute equals test3
//li[@id='test3']
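A sketch of both predicate forms (same condensed document as above):

from lxml import etree

xml = ('<bookstore><li id="test3">li test3</li>'
       '<book><li id="test1">li test1</li><li>li test2</li></book>'
       '<li id="test4">li test4</li></bookstore>')
dom = etree.XML(xml)

print([li.text for li in dom.xpath('//li[@id]')])          # the three li that carry an id
print([li.text for li in dom.xpath("//li[@id='test3']")])  # ['li test3']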
Example: extract the id attribute of every li
//li/@id     (returns the attribute values themselves)
//li/text()  (returns the text content of the tags)
A complete example:
from lxml import etree
html = '''
<bookstore>
    <li id="test3">li test3</li>
    <book>
        <title>Harry Potter</title>
        <author>J K. Rowling</author>
        <year>2005</year>
        <price>29.99</price>
        <li id="test1">li test1</li>
        <li>li test2</li>
    </book>
    <li id="test4">li test4</li>
</bookstore>
'''
dom = etree.HTML(html)
ret = dom.xpath('//li/text()')
print(ret)
ret = dom.xpath('//li/@id')
print(ret)
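With the sample document above, this should print the four li texts first, then the three id values (test3, test1, test4); the li without an id contributes no attribute value.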
Another complete example, this time iterating over the matched elements:
from lxml import etree
html = '''
<bookstore>
    <li id="test3">li test3</li>
    <book>
        <title>Harry Potter</title>
        <author>J K. Rowling</author>
        <year>2005</year>
        <price>29.99</price>
        <li id="test1">li test1</li>
        <li>li test2</li>
    </book>
    <li id="test4">li test4</li>
</bookstore>
'''
dom = etree.HTML(html)
ret = dom.xpath('//li[@id]')
for li in ret:
    print(li.text)                        # text content of the element
    print(li.attrib['id'])                # value of the id attribute
    print(etree.tostring(li).decode())    # the element serialized back to markup
    print('=' * 50)
# Scrape the 'Fei Ren Zai' comic from baozoumanhua.com
# author : shuaijie_liu
# date   : 2019-05-01
# email  : [email protected]
import requests
from lxml import etree
def down_html(url, timeout=10, headers=None, verify=True):
    # Fetch a page and return its HTML text.
    if not headers:
        headers = {
            'User-Agent': r'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'
        }
    req = requests.get(url=url, headers=headers, verify=verify, timeout=timeout)
    return req.text
def find_imgs(data, exp):
    # Parse the page and return whatever the XPath expression matches.
    dom = etree.HTML(data)
    return dom.xpath(exp)
def download_img(url, filename, timeout=10, headers=None, verify=True):
    # Download one image and write the raw bytes to disk.
    if not headers:
        headers = {
            'User-Agent': r'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'
        }
    req = requests.get(url=url, headers=headers, verify=verify, timeout=timeout)
    with open(filename, 'wb') as f:
        f.write(req.content)
if __name__ == '__main__':
    for page in range(27):
        url = r'http://baozoumanhua.com/channels/1562?page={}'.format(page)
        imgs = r'//div[@class="article-body"]//img/@src'
        try:
            html = down_html(url=url)
        except Exception as e:
            print('Html Error {} : {}'.format(url, e))
            continue
        img_urls = find_imgs(html, imgs)
        if not img_urls:  # nothing matched on this page
            continue
        # Drop consecutive duplicates from the matched URLs.
        ret = [img_urls[0]]
        for img_url in img_urls:
            if img_url != ret[-1]:
                ret.append(img_url)
        filename = 0
        for img_url in ret:
            filename += 1
            file = '{}-{}.jpg'.format(page + 1, filename)
            print('downloading {}'.format(img_url))
            try:
                download_img(img_url, file)
            except Exception as e:
                print('IMAGE ERROR {}:{}'.format(img_url, e))