Python爪巴虫

from bs4 import BeautifulSoup
from urllib.request import urlopen
import re

# Download the demo page and decode it as UTF-8. The context manager closes
# the HTTP response as soon as the body has been read.
with urlopen("https://morvanzhou.github.io/static/scraping/table.html") as response:
    html = response.read().decode('utf-8')
# print(html)

# Parse the markup with the lxml backend (the lxml package must be installed).
soup = BeautifulSoup(html, features='lxml')

# Attribute access on the soup returns the first tag with that name.
print(soup.h1)
# <h1>...</h1>  (the page heading)

print(soup.p)
# <p>...</p>  (the first paragraph)
# Collect the target of every link on the page. `href=True` restricts the
# search to <a> tags that actually carry an href attribute, so the lookup
# below cannot raise KeyError.
all_href = [tag['href'] for tag in soup.find_all('a', href=True)]
print('\n', all_href)

# Select elements by CSS class.
month = soup.find_all('li', {"class": "month"})
for m in month:
    print(m)
    # <li class="month">XXX</li>
    print(m.get_text())
    # XXX

# Restrict the match with a regular expression: images whose src contains
# ".jpg". A raw string keeps the backslash escape intact for the regex.
img_links = soup.find_all("img", {"src": re.compile(r'.*?\.jpg')})
print(img_links)
# [<img src="..."/>]
for link in img_links:
    print(link['src'])
# https://morvanzhou.github.io/static/img/course_cover/tf.jpg

# Restrict links with a regular expression: hrefs pointing at the morvan site.
course_links = soup.find_all('a', {'href': re.compile('https://morvan.*')})
print(course_links)
# [<a href="https://morvanzhou.github.io/">...</a>]
for link in course_links:
    print(link['href'])
# https://morvanzhou.github.io/
    import requests
    import webbrowser
    
    # --- GET: query parameters are passed through `params` ---
    param = {"wd": "莫烦Python"}  # search keyword
    r = requests.get('http://www.baidu.com/s', params=param)
    print(r.url)
    # http://www.baidu.com/s?wd=%E8%8E%AB%E7%83%A6Python
    webbrowser.open(r.url)  # show the result page in the default browser

    # --- POST: submit form fields ---
    data = {'firstname': '莫烦', 'lastname': ''}  # form fields to submit
    r = requests.post('http://pythonscraping.com/files/processing.php', data=data)
    print(r.text)
    # NOTE(review): the sample output below contains a surname, so it was
    # presumably recorded with a non-empty 'lastname' -- confirm.
    # Hello there, 莫烦 周!

    # --- POST: upload an image ---
    # Keep the file open only for the duration of the upload so the handle
    # is always closed, even if the request raises.
    with open('./image.png', 'rb') as image:
        file = {'uploadFile': image}
        r = requests.post(
            'http://pythonscraping.com/files/processing2.php', files=file)
    print(r.text)
    # The file image.png has been uploaded.

    # --- Session: log in once, then reuse the cookies ---
    # The context manager closes the session when the block exits.
    with requests.Session() as session:
        payload = {'username': 'Morvan', 'password': 'password'}
        r = session.post(
            'http://pythonscraping.com/pages/cookies/welcome.php', data=payload)
        print(r.cookies.get_dict())
        # {'username': 'Morvan', 'loggedin': '1'}

        # The second request goes through the same session, so the cookies
        # set by the login response are sent along with it.
        r = session.get("http://pythonscraping.com/pages/cookies/profile.php")
        print(r.text)
        # Hey Morvan! Looks like you're still logged into the site!

    你可能感兴趣的:(Python爪巴虫)