from re import findall
from urllib.request import urlopen

url = 'http://news.pdsu.edu.cn/info/1005/31269.htm'
with urlopen(url) as fp:
    content = fp.read().decode('utf-8')

# Find all image link addresses on the page.
# NOTE: the original pattern was truncated; this regex is an assumed
# reconstruction that captures the src attribute of each <img> tag.
pattern = r'<img.*?src="(.+?)"'
result = findall(pattern, content)  # the capture group keeps only the src value

# Read each image and write it to a local file.
path = 'f:/pic/嘻嘻嘻_'  # '嘻嘻嘻' is the file-name prefix for the saved photos
for index, item in enumerate(result):
    with urlopen('http://news.pdsu.edu.cn' + str(item)) as fp:
        with open(path + str(index + 1) + '.png', 'wb') as fp1:
            fp1.write(fp.read())
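The script assumes the folder f:/pic/ already exists; if it does not, the open() call will fail. A minimal sketch (assuming the same drive and folder) that creates it first:

import os
os.makedirs('f:/pic', exist_ok=True)  # create the target folder if it is missing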
import scrapy
from bs4 import BeautifulSoup

class mmlSpider(scrapy.Spider):
    name = 'mml'
    allowed_domains = ['pdsu.cn']
    start_urls = ['http://news.pdsu.edu.cn/']

    def parse(self, response):
        html_doc = response.text
        soup = BeautifulSoup(html_doc, 'html.parser')
        # The column headings on the home page sit in <h2 class="fl"> tags.
        headings = soup.find_all('h2', class_='fl')
        content = ''
        for lm in headings:
            print(lm.text)
            content += lm.text + '\n'
        with open('f:\\lm.txt', 'a+', encoding='utf-8') as fp:
            fp.writelines(content)
# The column text is saved to lm.txt on drive F.
Open cmd as administrator and install the bs4 and Scrapy libraries.
Enter:
pip install bs4
pip install scrapy
Then enter:
scrapy startproject xinwen
cd xinwen
scrapy genspider lm news.pdsu.edu.cn
scrapy crawl lm
Here xinwen is the project name, lm is the spider name, and pdsu.edu.cn is the domain to crawl.
Find lm.py in the project folder, open it, paste in the code above, run the spider, and then open lm.txt on drive F to see the result.
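For reference, scrapy genspider lm news.pdsu.edu.cn produces a skeleton spider roughly like the one below (the exact template varies a little between Scrapy versions); the parse method is where the parsing code above belongs:

import scrapy

class LmSpider(scrapy.Spider):
    name = 'lm'
    allowed_domains = ['news.pdsu.edu.cn']
    start_urls = ['http://news.pdsu.edu.cn/']

    def parse(self, response):
        pass  # replace this stub with the BeautifulSoup parsing code shown above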
import requests
import bs4

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}
url = 'http://mooc1.chaoxing.com/course/206046270.html'
response = requests.get(url, headers=headers).text
soup = bs4.BeautifulSoup(response, 'html.parser')
# The chapter text on the course page sits in <div class="f16 chapterText"> elements.
t = soup.find_all('div', class_='f16 chapterText')
for ml in t:
    print(ml.text)
Open cmd as administrator.
Enter:
scrapy startproject chaoxing
cd chaoxing
scrapy genspider lm news.mooc1.chaoxing.com/course/206046270.html
scrapy crawl lm
That is, scrapy startproject + project name,
and scrapy genspider + spider name + domain (e.g. news.mooc1.chaoxing.com/course/206046270.html).
Run the code above to print the chapter text of the course page.
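The script above is a standalone requests/bs4 program; to have it run under scrapy crawl lm, its logic must live inside the generated spider's parse method. A minimal sketch of that adaptation, assuming the project and spider names from the commands above and reusing the same CSS class selector:

import scrapy
import bs4

class LmSpider(scrapy.Spider):
    name = 'lm'
    start_urls = ['http://mooc1.chaoxing.com/course/206046270.html']

    def parse(self, response):
        # Parse the downloaded page with BeautifulSoup and print the chapter text.
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        for ml in soup.find_all('div', class_='f16 chapterText'):
            print(ml.text)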