【Python爬虫】 xpath过滤标签提取url

首页菜单栏解析

# -*- coding: utf-8 -*-
#爬虫阳光电影网
from urllib.parse import urljoin

import requests
from lxml import etree
# Home page of the site being crawled; menu links are resolved against it.
url = 'http://www.ygdy8.com'
# Seconds to wait for each HTTP response, so a stalled server cannot hang
# the script indefinitely (requests has no timeout by default).
REQUEST_TIMEOUT = 10
# Menu entry that is deliberately excluded from the crawl.
SKIPPED_MENU_HREF = '/html/gndy/index.html'

# --- Step 1: download and parse the home page -------------------------------
req = requests.get(url, timeout=REQUEST_TIMEOUT)
status_code = req.status_code
# Fail fast on an HTTP error instead of parsing an error page and finding
# no menu entries further down.
req.raise_for_status()
# Set the decoding explicitly before reading .text so the response body is
# decoded as gb2312 rather than with whatever requests would guess.
req.encoding = 'gb2312'
html = req.text
selector = etree.HTML(html)

# --- Step 2: extract the menu bar links -------------------------------------
# Only <li> elements whose position is below 10 are kept; later entries of
# the menu list are filtered out by the XPath predicate.
menu_links = selector.xpath(
    '//div[@id="menu"]/div[@class="contain"]/ul/li[position()<10]/a'
)

# (name, absolute_url) pairs for every menu entry that will be crawled.
menus = []
for info in menu_links:
    hrefs = info.xpath('@href')
    names = info.xpath('text()')
    # Skip anchors without visible text or without an href; indexing [0]
    # on an empty XPath result would raise IndexError.
    if not hrefs or not names:
        continue
    if hrefs[0] == SKIPPED_MENU_HREF:
        continue
    # urljoin yields the same URL as plain concatenation for root-relative
    # hrefs, and also copes with absolute or non-rooted ones.
    menus.append((names[0], urljoin(url, hrefs[0])))

# --- Step 3: download every menu page ---------------------------------------
# Pre-bind the names so they exist (as None) even when no menu entry was
# found; after the loop they hold the values for the last menu page fetched.
menu_name = menu_url = req2 = html2 = None
for menu_name, menu_url in menus:
    req2 = requests.get(menu_url, timeout=REQUEST_TIMEOUT)
    req2.encoding = 'gb2312'
    html2 = req2.text

你可能感兴趣的:(【Python爬虫】 xpath过滤标签提取url)