【Python爬虫】第十四次作业

from lxml import etree
# Part 1: read the local exercise page and print two of the text nodes that
# sit directly under <div> elements.
# A context manager closes the file even if read() raises.
with open(r'C:\Users\CY\Desktop\xpath.html', 'r', encoding='utf-8') as file:
    html = file.read()
selector = etree.HTML(html)
# Evaluate the query once and index into the result instead of re-running it.
# NOTE(review): the hard-coded indexes assume the layout of this particular
# xpath.html; a different page raises IndexError here -- confirm against file.
div_texts = selector.xpath('//div/text()')
div1 = div_texts[0].strip()
div2 = div_texts[3].strip()
print(div1, div2)

# Part 2: same idea for the text nodes directly under <ul> elements.
ul_texts = selector.xpath('//ul/text()')
ul1 = ul_texts[0].strip()
ul2 = ul_texts[6].strip()
ul3 = ul_texts[8].strip()
print(ul1, ul2, ul3)

# Part 3: for the <a> inside the first three <li> items of the selected
# "works" / "title" list, print the link text and then its href.
infos = selector.xpath('//div[@class="works"][1]/ul[@class="title"][1]/li[position()<4]/a')
for info in infos:
    # [0] assumes every matched anchor has a text node and an href attribute.
    a_text = info.xpath('text()')[0]
    print(a_text)
    a_href = info.xpath('@href')[0]
    print(a_href)
# Part 4
import requests
from lxml import etree
url = 'http://www.ygdy8.com/'
# NOTE: a commented-out dict of browser-copied request headers (including a
# session cookie) used to live here. It was never passed to requests.get, so
# it has been removed as dead code.

# Fetch the page. The timeout keeps the script from hanging forever on an
# unresponsive server, and raise_for_status() stops an HTTP error page from
# being parsed as if it were the real content.
req = requests.get(url, timeout=10)
req.raise_for_status()
# Force the codec used by req.text instead of relying on what requests guesses.
req.encoding = 'gb2312'
html = req.text
selector = etree.HTML(html)
# Collect the <a> elements under ul/li of the selected "contain" div(s).
infos = selector.xpath('//div[@class="contain"][1]/ul/li/a')
for info in infos:
    a_text = info.xpath('text()')
    a_href = info.xpath('@href')
    # An anchor with no direct text node (or no href) yields an empty list;
    # skip it rather than crash with IndexError on [0].
    if not a_text or not a_href:
        continue
    print(a_text[0], a_href[0])

你可能感兴趣的:(【Python爬虫】第十四次作业)