import urllib.request
import os
#to open the url
def url_open(url):
    """Fetch *url* and return the raw response body.

    The request is sent with a desktop Firefox ``User-Agent`` header instead
    of urllib's default agent string.

    Args:
        url: Address to fetch, as a string.

    Returns:
        The complete response body as ``bytes`` (not decoded).

    Raises:
        urllib.error.URLError: If the request cannot be completed.
    """
    req = urllib.request.Request(url)
    req.add_header(
        'User-Agent',
        'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0',
    )
    # Open the prepared Request object, not the bare URL; otherwise the
    # header added above is never sent. The ``with`` block closes the
    # response once the body has been read.
    with urllib.request.urlopen(req) as response:
        return response.read()
# Get the current page number (e.g. 1, 2, 3, 4, ...).
def get_page(url):
    """Return the current comment-page number shown on the page at *url*.

    The page is downloaded with ``url_open``, decoded as UTF-8, and searched
    for the text ``current-comment-page``. The number is the text between
    that marker (plus three separator characters) and the next ``]``.

    Args:
        url: Address of the page to inspect.

    Returns:
        The page number as a string of whatever characters sit in front of
        the closing ``]``; the caller converts it to ``int``.

    Raises:
        ValueError: If the marker or the closing ``]`` is not present.
    """
    marker = 'current-comment-page'
    html = url_open(url).decode('utf-8')

    found = html.find(marker)
    if found == -1:
        # Without this check the -1 from find() would be turned into a
        # bogus offset and an arbitrary slice of the page would be returned.
        raise ValueError('page marker %r not found in %s' % (marker, url))

    # 23 characters in total: the 20-character marker plus three more that
    # lead up to the number (presumably the `">[` closing the attribute and
    # opening the bracket -- confirm against the live markup).
    start = found + len(marker) + 3
    end = html.find(']', start)
    if end == -1:
        raise ValueError('no closing "]" after page marker in %s' % url)
    return html[start:end]
# Find the image URLs on the page and return them as a list.
def find_imgs(url):
    """Collect the ``.jpg`` addresses referenced by ``img src=`` on a page.

    The page at *url* is downloaded with ``url_open`` and decoded as UTF-8.
    Every occurrence of ``img src=`` is examined; the address is taken to
    start right after the opening quote and to end with the first ``.jpg``
    that lies within 255 characters of the marker and before the closing
    quote of the attribute.

    Args:
        url: Address of the page to scan.

    Returns:
        A list of image addresses in document order. Protocol-relative
        addresses (``//host/x.jpg``) get ``http:`` prepended; any other
        address is resolved against *url*.
    """
    from urllib.parse import urljoin

    html = url_open(url).decode('utf-8')
    marker = 'img src='
    img_addrs = []

    pos = html.find(marker)
    while pos != -1:
        # The address starts after the marker and the opening quote.
        start = pos + len(marker) + 1
        quote = html[start - 1:start]

        # Look at most 255 characters past the marker, and never past the
        # closing quote, so that a ".jpg" belonging to some later attribute
        # or tag is not glued onto this address.
        limit = pos + 255
        if quote in ('"', "'"):
            closing = html.find(quote, start, limit)
            if closing != -1:
                limit = closing

        end = html.find('.jpg', start, limit)
        if end == -1:
            # Not a .jpg image: continue searching after this marker.
            resume = start
        else:
            resume = end
            src = html[start:end + 4]
            if src.startswith('//'):
                img_addrs.append('http:' + src)
            else:
                # Absolute addresses are kept as they are and relative ones
                # are resolved, instead of blindly prefixing "http:".
                img_addrs.append(urljoin(url, src))
        pos = html.find(marker, resume)
    return img_addrs
#save the imgs
def save_imgs(folder, img_addrs):
    """Download every address in *img_addrs* into the current directory.

    Each file is named after the last ``/``-separated component of its
    address and is written in binary mode, overwriting any existing file of
    the same name.

    Args:
        folder: Accepted for interface compatibility but not used here;
            files go to the current working directory.
        img_addrs: Iterable of image addresses to download.
    """
    for each in img_addrs:
        # The last path component of the address is used as the file name.
        filename = each.split('/')[-1]
        # Download first: if the request fails, no empty file is left behind.
        img = url_open(each)
        with open(filename, 'wb') as f:
            f.write(img)
def download_mm(folder='mm', pages=10):
    """Download the images from the newest *pages* comment pages.

    The target folder is created if necessary and made the current working
    directory. The newest page number is read with ``get_page``; then each
    page, counting downwards from it, is scanned with ``find_imgs`` and its
    images are stored with ``save_imgs``.

    Args:
        folder: Directory that receives the images; created when missing.
        pages: Maximum number of pages to process, starting at the newest.

    Raises:
        ValueError: If the newest page number cannot be converted to ``int``.
    """
    # exist_ok lets the script be run again with the same folder.
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)

    url = 'http://jandan.net/ooxx/'
    newest = int(get_page(url))

    for offset in range(pages):
        # Derive each page from the fixed starting number. Decrementing a
        # running counter by the loop index would visit N, N-1, N-3, N-6, ...
        # and skip pages.
        page_num = newest - offset
        if page_num < 1:
            # Fewer pages exist than were requested; do not ask for
            # non-positive page numbers.
            break
        page_url = url + 'page-' + str(page_num) + '#comments'
        img_addrs = find_imgs(page_url)
        save_imgs(folder, img_addrs)
# Start a download with the default settings when run as a script.
if __name__ == "__main__":
    download_mm()