python 爬图 helloworld

最近发现 吾志 上用户的头像都很个性,另外,对于没有把日记设为私密的用户,当天的日记是公开的,谁都可以查看。

所以,如果每天把所有可查看的日记爬一遍,那么~~ 哈哈

 

我以前对爬虫只是了解一点点,没有真的玩过。既然今晚兴致来了,那就随便学一下咯~

 

参考教程:http://cuiqingcai.com/1052.html (注意:下面的代码基于 Python 2,使用了 urllib2 和 cookielib,无法直接在 Python 3 下运行。)

 

 1 #coding=utf-8

 2 import os

 3 import urllib

 4 import urllib2

 5 import re

 6 import cookielib

 7 

 8 

 9 

def mkdir(path):
    """Create directory *path*, including missing parents, and return it.

    The path is cleaned first: surrounding whitespace is stripped, then any
    trailing backslashes are removed. The cleaned path is what gets created
    and what is returned.

    If something already exists at the cleaned path, nothing is created and
    no error is raised. Any other failure from ``os.makedirs`` propagates as
    ``OSError``.
    """
    # Strip whitespace on both sides, then the trailing "\" characters.
    path = path.strip().rstrip("\\")

    # Attempt the creation directly instead of checking os.path.exists()
    # first: the check-then-create sequence can fail when another process
    # creates the directory between the two calls.
    try:
        os.makedirs(path)
    except OSError:
        # An already-existing path is fine; anything else is a real error.
        if not os.path.exists(path):
            raise

    return path

20 

21 

def save_file(path, file_name, data):
    """Write *data* to the file *file_name* inside directory *path*.

    The directory is created through ``mkdir`` if needed. When *data* is
    ``None`` the call does nothing, so a failed download can be passed
    straight through without creating an empty file.

    path      -- target directory
    file_name -- name of the file inside that directory
    data      -- byte string to write, or None to skip
    """
    if data is None:
        return

    # Use the cleaned path returned by mkdir, so the file is opened inside
    # the directory that was actually created (mkdir strips whitespace and
    # trailing backslashes from the path it is given).
    directory = mkdir(path)
    if not directory.endswith("/"):
        directory = directory + "/"

    # The with-block closes (and therefore flushes) the file even when the
    # write raises.
    with open(directory + file_name, "wb") as f:
        f.write(data)

33 

34 

35 

# Request settings used by getHtml: a browser-like User-Agent header and an
# (empty) set of form fields.
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36'
headers = {'User-Agent' : user_agent}
values = {}
# urlencode of an empty dict is the empty string.
data = urllib.urlencode(values)


def getHtml(url):
    """Fetch *url* with the module-level headers and return the response body.

    Uses a 10 second timeout. Errors raised by urllib2 (for example
    ``urllib2.URLError``) are not caught here and propagate to the caller.
    """
    # urllib2 sends a POST whenever the request data is not None, even if it
    # is an empty string. Pass None for an empty payload so a plain GET is
    # issued; a non-empty payload is still sent as before.
    req = urllib2.Request(url, data or None, headers)
    page = urllib2.urlopen(req, timeout=10)
    try:
        return page.read()
    finally:
        # Close the response even when read() fails.
        page.close()

48 

def get_file(url):
    """Download *url* and return its raw content, or None on failure.

    Any ordinary error during the download is printed and turned into a
    ``None`` return value, so one failed download does not abort the caller.
    KeyboardInterrupt and SystemExit are not intercepted.
    """
    try:
        opener = urllib2.build_opener()
        opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
        # NOTE: this also makes the opener the process-wide default used by
        # later urllib2.urlopen() calls.
        urllib2.install_opener(opener)
        req = urllib2.Request(url)
        # Same 10 second timeout as getHtml, so a stalled server cannot hang
        # the whole run.
        operate = opener.open(req, timeout=10)
        try:
            return operate.read()
        finally:
            # Close the response even when read() fails.
            operate.close()
    except Exception as e:
        # Catch Exception rather than BaseException so Ctrl-C still stops
        # the script.
        print('%s fuck' % (e,))
        return None

62 

63 

def getImg(html):
    """Download every .jpg referenced in *html* into the current directory.

    An image URL is the value of a ``src="..."`` attribute ending in ``.jpg``
    that is immediately followed by an ``alt=`` attribute. Images are saved
    as ``0.jpg``, ``1.jpg``, ... in the order they appear.

    Returns the number of image URLs found. A URL whose download fails is
    still counted and keeps its index, but no file is written for it
    (get_file returns None and save_file skips None).
    """
    img_pattern = re.compile(r'src="(.+?\.jpg)" alt=')
    img_urls = img_pattern.findall(html)

    for index, img_url in enumerate(img_urls):
        save_file('.', '%s.jpg' % index, get_file(img_url))

    return len(img_urls)

77 

78 

79 

# Script entry: fetch the page listing the latest public diaries, download
# every .jpg it references, and print how many image URLs were found.
html = getHtml("https://wuzhi.me/last")
image_count = getImg(html)
print(image_count)

 

十分简陋,哈哈~

你可能感兴趣的:(helloworld)