Python Advanced Handbook 9 - Automating the Web (scraping, web testing, etc.)

#! python3
## Opening a page with webbrowser
# Launches a map in the browser using an address from the command line or clipboard.
import webbrowser,sys,pyperclip
if len(sys.argv) > 1:
    # Get address from command line.
    address = ' '.join(sys.argv[1:])
else:
    # Get address from clipboard.
    address = pyperclip.paste()
webbrowser.open('https://www.google.com/maps/place/'+address)


## Downloading from the web with the requests module
# A replacement for Python's urllib2 module, which is too complicated to use.
# pip install requests
import requests
res = requests.get('http://www.gutenberg.org/cache/epub/1112/pg1112.txt')
type(res)                                # <class 'requests.models.Response'>
res.status_code == requests.codes.ok     # True if the download succeeded (HTTP 200)
print(res.status_code)
len(res.text)
print(res.text[:250])


## To check whether a download succeeded, call raise_for_status() on the Response object; it raises an exception if the download failed.
# Always call raise_for_status() after requests.get().
# For more graceful error handling, wrap the call in try/except.
import requests
res = requests.get('http://www.abc.com/page_lost')
try:
    res.raise_for_status()
except Exception as exc:
    print('There was a problem: %s' % (exc))


## Saving a downloaded file to disk
import requests
res = requests.get('http://www.gutenberg.org/cache/epub/1112/pg1112.txt')
res.raise_for_status()
tfile = open('pg_get.txt','wb')              # open in 'wb' binary-write mode to preserve the text's Unicode encoding
for chunk in res.iter_content(100000):       # iter_content() returns a chunk of up to the given number of bytes on each iteration
    tfile.write(chunk)
tfile.close()
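
# For large files, a more idiomatic variant (a sketch, not from the original)
# uses a with-statement plus requests' streaming mode, so the whole response
# is never held in memory at once:
import requests
res = requests.get('http://www.gutenberg.org/cache/epub/1112/pg1112.txt', stream=True)
res.raise_for_status()
with open('pg_get_stream.txt', 'wb') as tfile:           # the file is closed automatically
    for chunk in res.iter_content(chunk_size=100000):    # up to 100,000 bytes per chunk
        tfile.write(chunk)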


## Parsing HTML with the BeautifulSoup module
# pip install beautifulsoup4
import requests,bs4
res = requests.get('http://nostarch.com')
res.raise_for_status()
nosoup = bs4.BeautifulSoup(res.text, 'html.parser')   # pass a parser name explicitly to avoid a bs4 warning
type(nosoup)
print(nosoup)


## Parsing a local HTML file and selecting elements with select()
efile = open('example.html')
nosoup = bs4.BeautifulSoup(efile.read(), 'html.parser')   # a local HTML file can be parsed directly
elems = nosoup.select('#author')      # CSS selector: the element with id="author"
type(elems)                           # a list of all matching elements
len(elems)
type(elems[0])                        # each match is a bs4 Tag object
elems[0].getText()                    # the text within the element
str(elems[0])                         # the element as a string of HTML
elems[0].attrs                        # dict of the element's attributes

pelems = nosoup.select('p')           # all <p> elements
str(pelems[0])
pelems[0].getText()
str(pelems[1])
pelems[1].getText()

selems = nosoup.select('span')[0]     # the first <span> element
str(selems)
selems.get('id')                      # value of the id attribute, or None if absent
selems.attrs
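
# For reference, a minimal example.html that would satisfy the selectors above
# (assumed content for illustration; the original file is not shown):
with open('example.html', 'w', encoding='utf-8') as f:
    f.write('''<html><body>
<p class="slogan">Learn Python the easy way!</p>
<p>By <span id="author">Al Sweigart</span></p>
</body></html>''')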


## Open a page and automatically open five of the links on it
import requests,sys,webbrowser,bs4
print('Ebscn...')
res = requests.get('http://www.ebscn.com/')
res.raise_for_status()
len(res.text)
# Retrieve top search result links.
soup = bs4.BeautifulSoup(res.text, 'html.parser')
print(soup)
# Open a browser tab for each result.
linkElems = soup.select('li a')
numOpen = min(5,len(linkElems))
for i in range(numOpen):
    webbrowser.open(linkElems[i].get('href'))
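
# Note: href values can be relative paths; a sketch (an addition, not in the
# original) that makes them absolute with the standard library's urljoin:
import urllib.parse
for i in range(numOpen):
    href = linkElems[i].get('href') or ''                # guard against a missing href
    webbrowser.open(urllib.parse.urljoin('http://www.ebscn.com/', href))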


## Downloading comics from a site
# 1. Download the page with the requests module.
# 2. Use Beautiful Soup to find the URL of the comic image on the page.
# 3. Download the comic image with iter_content() and save it to disk.
# 4. Find the URL of the Prev comic link, then repeat.
import requests,os,bs4
url = 'http://xkcd.com'               # starting url
os.makedirs('xkcd',exist_ok=True)     # store comics in ./xkcd
while not url.endswith('#'):          # on the first comic, the Prev link's href is just '#'
    # Download the page.
    print('Downloading page %s...' % url)
    res = requests.get(url)
    res.raise_for_status()
    # Find the URL of the comic image.
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    comicElem = soup.select('#comic img')
    if comicElem == []:
        print('Could not find comic image.')
    else:
        comicUrl = 'http:'+comicElem[0].get('src')        # src is protocol-relative ('//...'), so prepend the scheme
        # Download the image.
        print('Downloading image %s...' % (comicUrl))
        res = requests.get(comicUrl)
        res.raise_for_status()
        # Save the image to ./xkcd.
        imageFile = open(os.path.join('xkcd',os.path.basename(comicUrl)),'wb')
        for chunk in res.iter_content(100000):
            imageFile.write(chunk)
        imageFile.close()
    # Get the Prev button's url.
    prevLink = soup.select('a[rel="prev"]')[0]
    url = 'http://xkcd.com' + prevLink.get('href')
print('Done.')
        
## Controlling the browser with the selenium module
#       webdriver method                               WebElement object returned
# browser.find_element_by_class_name(name)         element that uses the CSS class name
# browser.find_element_by_css_selector(selector)   element matching the CSS selector
# browser.find_element_by_id(id)                   element with a matching id attribute value
# browser.find_element_by_link_text(text)          <a> element whose text exactly matches the given text
# browser.find_element_by_partial_link_text(text)  <a> element whose text contains the given text
# browser.find_element_by_name(name)               element with a matching name attribute value
# browser.find_element_by_tag_name(name)           element with a matching tag name
# Each of the methods above returns a WebElement object representing the first element on the page that matches the query.
# The browser.find_elements_* variants return a list of all matching elements.
# The returned WebElement objects also have attributes and methods of their own (see the sketch after the snippet below).
#
# pip install selenium
from selenium import webdriver
# browser = webdriver.Firefox()
# browser = webdriver.Ie()
browser = webdriver.Chrome()
browser.get('http://inventwithpython.com')
try:
    elem = browser.find_element_by_class_name('nav-link')
    print('Found <%s> element with that class name!' % (elem.tag_name))
except Exception:
    print('Was not able to find an element with that name.')
# Clicking elements on the page
linkElem = browser.find_element_by_link_text('Read It Online')
type(linkElem)
linkElem.click()
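
# WebElement objects expose attributes and methods of their own; a quick
# sketch continuing the session above (values depend on the live page):
elem = browser.find_element_by_tag_name('a')   # the first <a> element on the page
elem.tag_name                                  # tag name, e.g. 'a'
elem.get_attribute('href')                     # value of the href attribute, or None
elem.text                                      # the visible text within the element
elem.is_displayed()                            # True if the element is visible
elem.is_enabled()                              # True if the element is enabled
elem.location                                  # dict with the element's x and y coordinates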


## Filling in and submitting forms
from selenium import webdriver
browser = webdriver.Chrome()
browser.get('http://gmail.com')
emailElem = browser.find_element_by_id('Email')       # get the page element (these IDs may have changed on the live site)
emailElem.send_keys('[email protected]')    # type text into it
passwdElem = browser.find_element_by_id('Passwd')
passwdElem.send_keys('123456')
passwdElem.submit()                                   # submit; equivalent to clicking the form's Submit button


## Sending special keys
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
browser = webdriver.Chrome()
browser.get('http://nostarch.com')
htmlElem = browser.find_element_by_tag_name('html')
htmlElem.send_keys(Keys.END)                          # scrolls to bottom
htmlElem.send_keys(Keys.HOME)                         # scrolls to top
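
# Other commonly used attributes of the Keys class include:
# Keys.UP, Keys.DOWN, Keys.LEFT, Keys.RIGHT    the arrow keys
# Keys.ENTER, Keys.RETURN                      the enter and return keys
# Keys.PAGE_UP, Keys.PAGE_DOWN                 page up and page down
# Keys.ESCAPE, Keys.BACK_SPACE, Keys.DELETE    esc, backspace, and delete
# Keys.TAB                                     the tab key
# Keys.F1, Keys.F2, ..., Keys.F12              the function keys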


## Simulating browser buttons
browser.back()       # the Back button
browser.forward()    # the Forward button
browser.refresh()    # the Refresh/Reload button
browser.quit()       # close the window and quit the browser


## selenium can also modify cookies, take screenshots of pages, and run custom JavaScript; it is extremely capable.
#  For details, see http://selenium-python.readthedocs.org
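
# A brief sketch of those capabilities (all standard selenium calls; the
# cookie name and value below are made up for illustration):
from selenium import webdriver
browser = webdriver.Chrome()
browser.get('http://nostarch.com')
browser.get_cookies()                                  # list of cookie dicts for the current site
browser.add_cookie({'name': 'demo', 'value': '1'})     # set a cookie
browser.save_screenshot('screenshot.png')              # save a snapshot of the page as a PNG
browser.execute_script('return document.title;')       # run custom JavaScript and get its return value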
