Learning Python Web Scraping: Grabbing Product Names and Prices

I noticed there are websites that let you look up the price history of products on shopping sites, which is quite handy for checking whether something you want to buy is at its historical low. After using them a few times I started wondering how such a site works. The answer is a web crawler: fetch the product prices from these shopping sites on a schedule and save them to a database. On later crawls, skip an item if its price hasn't changed; if the price has changed, insert the new price together with the time of the change. Accumulate this over time and you can look up the price history of any of these products.
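
The crawler below doesn't implement this skip-or-insert step yet; it simply records every price it sees. As a rough idea of what that step could look like, here is a minimal sketch against the same qi_jd_goods table the crawler writes to (the helper name and the exact queries are my own illustration, not part of the crawler):

```python
import datetime
import pymysql

def record_price(db, goods_id, name, price):
    """Insert a new row only if the latest stored price differs from the current one."""
    # db = pymysql.connect(user='root', passwd='', host='localhost', db='qw', charset='utf8')
    cur = db.cursor()
    # latest recorded price for this product, if any
    cur.execute(
        "SELECT price FROM qi_jd_goods WHERE goods_id=%s ORDER BY date_add DESC LIMIT 1",
        (goods_id,))
    row = cur.fetchone()
    if row is not None and str(row[0]) == str(price):
        return  # price unchanged: nothing to record
    # price changed (or first sighting): store it with today's date
    cur.execute(
        "INSERT INTO qi_jd_goods(goods_id, name, price, date_add) VALUES (%s, %s, %s, %s)",
        (goods_id, name, price, datetime.date.today()))
    db.commit()
```

Called once per crawled item, this keeps only the dates on which a price actually changed.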

With the principle clear, the next question was how to implement it. I first wrote a small JD.com crawler in PHP. It ran, but far too slowly, because the PHP script worked in a single thread. Since most crawlers I found online were written in Python, I spent three days learning Python and built a simple crawler that grabs JD product names and prices. I'm posting the code first and will write up the crawler's workflow when I have time.

The source code is attached below for reference:

```python
import re
import urllib.request
import urllib.parse
from collections import deque
import threading
import time
import json
import pymysql
import datetime

g_this_date = datetime.date.today()
g_db = pymysql.connect(user='root', passwd='', host='localhost', db='qw', charset='utf8')
g_db_cursor = g_db.cursor()
g_db_sql_start = "INSERT INTO qi_jd_goods(goods_id,name,price,date_add) VALUES "
g_db_sql_val = ''

g_queue = deque()       # crawl queue
g_visited_list = set()  # list pages already visited
g_visited_item = set()  # items already visited
g_cnt_list_all = 0      # total list pages crawled
g_cnt_list_sec = 0      # list pages crawled successfully
g_cnt_item_all = 0      # total items crawled
g_cnt_item_sec = 0      # items crawled successfully


class SpiderJd:
    def go(self, tid, q):
        global g_queue
        global g_visited_list
        global g_visited_item
        global g_cnt_list_all
        global g_cnt_list_sec
        global g_cnt_item_all

        if q:
            this_url = q                  # URL taken from the head of the queue
            g_visited_list |= {this_url}  # mark it as visited
            g_cnt_list_all += 1
            data = self.get_data(this_url)
            if data is None:
                return
            g_cnt_list_sec += 1
            # Extract every link on the page with a regex; unvisited list pages
            # go back into the crawl queue, item links are fetched right away.
            re_link = re.compile('href="(.+?)"')
            link_all = re_link.findall(data)
            for i in link_all:
                i_parse = urllib.parse.urlparse(i)
                if i_parse.path == '/list.html' and i not in g_visited_list:
                    g_queue.append(i)
                if i_parse.netloc == 'item.jd.com':
                    try:
                        i_id = str(re.compile(r'/(\d+)').findall(i_parse.path)[0])
                        if i_id not in g_visited_item:
                            g_visited_item |= {i_id}
                            g_cnt_item_all += 1
                            self.get_item(tid, i_id)
                    except:
                        continue

    def get_item(self, tid, gid):
        global g_cnt_item_sec
        global g_this_date
        this_name = self.get_name(gid)
        this_price = self.get_price(gid)
        g_cnt_item_sec += 1
        print('tid=%s, cnt_l_a=%d, cnt_l_s=%d, cnt_i_a=%d, cnt_i_s=%d, id=%s, name=%s, price=%s'
              % (tid, g_cnt_list_all, g_cnt_list_sec, g_cnt_item_all, g_cnt_item_sec, gid, this_name, this_price))
        pymysql_insert("('" + gid + "',\"" + str(this_name) + "\",'" + str(this_price) + "','" + str(g_this_date) + "'),")

    def get_data(self, url):
        try:
            urlop = urllib.request.urlopen(url, timeout=2)
        except:
            return None
        if 'html' not in urlop.getheader('Content-Type'):
            return None
        try:
            this_data = urlop.read()
        except:
            return None
        try:
            # decode according to the charset announced in the response header
            if 'UTF-8' in urlop.getheader('Content-Type') or 'utf-8' in urlop.getheader('Content-Type'):
                data = this_data.decode("UTF-8")
            elif 'GBK' in urlop.getheader('Content-Type') or 'gbk' in urlop.getheader('Content-Type'):
                data = this_data.decode("GBK")
            else:
                data = this_data.decode("UTF-8")
        except:
            return None
        return data

    def get_price(self, gid):
        # JD exposes prices through a separate JSON interface
        price_url = "http://p.3.cn/prices/mgets?skuIds=J_" + gid + "&type=1"
        try:
            price_json = json.loads(urllib.request.urlopen(price_url).read().decode("gbk"))[0]
            if price_json['p'] and price_json['p'] != '-1.00':
                return price_json['p']
        except:
            return 0
        return 0

    def get_name(self, gid):
        url = "http://item.jd.com/" + gid + ".html"
        data = self.get_data(url)
        if data is not None:
            # the product name sits in <div id="name"><h1>...</h1> on the item page
            re_name = re.compile(r'<div id="name">\s*<h1>(.+?)</h1>')
            match = re_name.findall(data)
            if len(match) != 0:
                return match[0]
        return None


class myThread(threading.Thread):
    def __init__(self, tid):
        threading.Thread.__init__(self)
        self.tid = tid

    def run(self):
        global g_queue
        print("Starting " + self.tid)
        while g_queue:
            queueLock.acquire()
            q = g_queue.popleft() if g_queue else None  # queue may empty between the check and the pop
            queueLock.release()

            s.go(self.tid, q)
        print("Exiting " + self.tid)


def pymysql_insert(val):
    global g_cnt_item_sec
    global g_db_sql_start
    global g_db_sql_val
    global g_db
    global g_db_cursor
    g_db_sql_val += val
    # flush the buffered values to MySQL in batches of 100 items
    if g_cnt_item_sec % 100 == 0 and len(g_db_sql_val) != 0:
        sql = g_db_sql_start + g_db_sql_val[:-1]
        g_db_sql_val = ''
        try:
            # execute the batched INSERT statement
            g_db_cursor.execute(sql)
            # commit it to the database
            g_db.commit()
        except:
            # roll back in case there is any error
            g_db.rollback()


if __name__ == '__main__':
    print(datetime.datetime.today())  # start time
    queueLock = threading.Lock()
    url = "http://www.jd.com/allSort.aspx"
    g_queue.append(url)

    s = SpiderJd()

    count = 1
    while count < 10:
        if count != 1:
            time.sleep(2)
        thread = myThread(str(count))
        thread.start()
        count = count + 1
    print(datetime.datetime.today())  # end time
```
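
Once the crawler has been running on a schedule for a while, the original question from the top of the post (is this product at its historical low?) becomes a single query over the accumulated rows. Here is a minimal sketch, again with pymysql and the same qi_jd_goods table, assuming the price column is numeric; the helper name is my own:

```python
import pymysql

def is_historical_low(db, goods_id, current_price):
    """Return True if current_price is at or below every price recorded for this product."""
    cur = db.cursor()
    cur.execute("SELECT MIN(price) FROM qi_jd_goods WHERE goods_id=%s", (goods_id,))
    row = cur.fetchone()
    lowest = row[0] if row and row[0] is not None else None
    # no history yet counts as a new low; otherwise compare with the recorded minimum
    return lowest is None or float(current_price) <= float(lowest)
```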
