shell:
1,处理locationProbe的linux shell脚本:
cat LocationProbe.csv | cut -d ',' -f 3,20,23|sed '1d'|sort -u -n>location_dealed.csv
2,将时间戳转换为可读时间的方法:
其中 @ 后面跟的是 Unix 时间戳(自 1970-01-01 00:00:00 UTC 起的秒数),例如:
date -d "@1385991753" "+%Y/%m/%d %H:%M:%S"
shell:
sort
uniq
cut
grep
sed
awk
wc
python:
import sys
import codecs
import pickle
#from operator import itemgetter
#from random import shuffle
from exceptions import ValueError
from numpy.random import shuffle
from recsys.algorithm import VERBOSE


def _to_text(field):
    """
    Coerces a single field to text so it can be written to a utf8 file

    Byte strings are decoded as utf8; unicode objects are returned untouched;
    anything else (numbers, or byte strings that are not valid utf8) goes
    through str().

    :param field: the value to convert
    :returns: a unicode or str object
    """
    try:
        return unicode(field, 'utf8')
    except (TypeError, ValueError):
        # TypeError: field is not a byte string (number, unicode, ...)
        # ValueError: covers UnicodeDecodeError for invalid utf8 bytes
        if isinstance(field, unicode):
            return field
        return str(field)


class Data:
    """
    Handles the relationships among users and items

    The dataset is kept as a plain list of <value, row_id, col_id> tuples
    """
    def __init__(self):
        self._data = []

    def __repr__(self):
        rows = self.get()
        s = '%d rows.' % len(rows)
        if len(rows):
            s += '\nE.g: %s' % str(rows[0])
        return s

    def __len__(self):
        return len(self.get())

    def __getitem__(self, i):
        """
        :param i: position of the tuple
        :type i: int
        :returns: the i-th tuple, or None when i is past the end of the dataset
        """
        if i < len(self._data):
            return self._data[i]
        return None

    def __iter__(self):
        return iter(self.get())

    def set(self, data, extend=False):
        """
        Sets data to the dataset

        :param data: a list of tuples
        :type data: list
        :param extend: append to the current data instead of replacing it
        :type extend: Boolean
        """
        if extend:
            self._data.extend(data)
        else:
            # NOTE: the list is stored by reference, not copied
            self._data = data

    def get(self):
        """
        :returns: a list of tuples
        """
        return self._data

    def add_tuple(self, tuple):
        """
        Validates and appends one tuple to the dataset

        :param tuple: a tuple containing <rating, user, item> information (e.g. <value, row, col>)
        :raises ValueError: if the tuple does not have 3 fields, the value is
            empty or a string, or one of the ids is None or ''
        """
        #E.g: tuple = (25, "ocelma", "u2") -> "ocelma has played u2 25 times"
        if not len(tuple) == 3:
            raise ValueError('Tuple format not correct (should be: <value, row_id, col_id>)')
        value, row_id, col_id = tuple
        # 0 is a legitimate value; only None, '' and other empty values are rejected
        if not value and value != 0:
            raise ValueError('Value is empty %s' % (tuple,))
        if isinstance(value, basestring):
            raise ValueError('Value %s is a string (must be an int or float) %s' % (value, tuple,))
        if row_id is None or row_id == '':
            raise ValueError('Row id is empty %s' % (tuple,))
        if col_id is None or col_id == '':
            raise ValueError('Col id is empty %s' % (tuple,))
        self._data.append(tuple)

    def split_train_test(self, percent=80, shuffle_data=True):
        """
        Splits the data in two disjunct datasets: train and test

        :param percent: % of training set to be used (test set size = 100-percent)
        :type percent: int
        :param shuffle_data: shuffle dataset? (the shuffle is done in place)
        :type shuffle_data: Boolean

        :returns: a tuple <Data, Data>
        """
        if shuffle_data:
            shuffle(self._data)
        length = len(self._data)
        # A single cut point guarantees that train and test never overlap and
        # always add up to the whole dataset. Slicing the test set from the end
        # with [-k:] returns *everything* when k == 0 (e.g. percent=100).
        split_at = int(round(length * percent / 100.0))
        split_at = max(0, min(length, split_at))
        train = Data()
        train.set(self._data[:split_at])
        test = Data()
        test.set(self._data[split_at:])
        return train, test

    def load(self, path, force=True, sep='\t', format=None, pickle=False):
        """
        Loads data from a file

        Lines that cannot be parsed with the given format, or whose value is
        not a number, are skipped. Without a format, a line must have either
        3 fields (value, row, col) or 2 fields (row, col; value defaults to 1).

        :param path: filename
        :type path: string
        :param force: Cleans already added data
        :type force: Boolean
        :param sep: Separator among the fields of the file content
        :type sep: string
        :param format: Format of the file content.
            Default format is 'value': 0 (first field), then 'row': 1, and 'col': 2.
            E.g: format={'row':0, 'col':1, 'value':2}. The row is in position 0,
            then there is the column value, and finally the rating.
            So, it resembles to a matrix in plain format.
            An optional 'ids' key set to int (or 'int') makes the loader skip
            lines whose ids are not integers
        :type format: dict()
        :param pickle: is input file in pickle format?
        :type pickle: Boolean
        :raises ValueError: if no format is given and a line has neither 2 nor 3 fields
        :raises TypeError: if a line yields no fields at all
        """
        if VERBOSE:
            sys.stdout.write('Loading %s\n' % path)
        if force:
            self._data = []
        if pickle:
            self._load_pickle(path)
            return

        i = 0
        with codecs.open(path, 'r', 'utf8') as handle:
            for line in handle:
                data = line.strip('\r\n').split(sep)
                value = None
                if not data:
                    raise TypeError('Data is empty or None!')
                if not format:
                    try:
                        value, row_id, col_id = data
                    except ValueError:
                        # Only <row, col> given: default value is 1
                        value = 1
                        row_id, col_id = data
                else:
                    try:
                        try:
                            value = data[format['value']]
                        except (KeyError, ValueError):
                            # No 'value' position in format: default value is 1
                            value = 1
                        try:
                            row_id = data[format['row']]
                        except KeyError:
                            row_id = data[1]
                        try:
                            col_id = data[format['col']]
                        except KeyError:
                            col_id = data[2]
                        row_id = row_id.strip()
                        col_id = col_id.strip()
                        if 'ids' in format and format['ids'] in (int, 'int'):
                            # Integer ids are mandatory: ignore the line otherwise
                            try:
                                row_id = int(row_id)
                                col_id = int(col_id)
                            except ValueError:
                                sys.stdout.write('Error (ID is not int) while reading: %s\n' % data)
                                continue
                    except IndexError:
                        # The line has fewer fields than the format expects: ignore it
                        sys.stdout.write('Error while reading: %s\n' % data)
                        continue
                # Try to convert ids to int (best effort: keep them as they are otherwise)
                try:
                    row_id = int(row_id)
                except (TypeError, ValueError):
                    pass
                try:
                    col_id = int(col_id)
                except (TypeError, ValueError):
                    pass
                # Add tuple
                try:
                    self.add_tuple((float(value), row_id, col_id))
                except (TypeError, ValueError):
                    if VERBOSE:
                        sys.stdout.write('\nError while reading (%s, %s, %s). Skipping this tuple\n' % (value, row_id, col_id))
                i += 1
                if VERBOSE:
                    # Progress marks
                    if i % 100000 == 0:
                        sys.stdout.write('.')
                    if i % 1000000 == 0:
                        sys.stdout.write('|')
                    if i % 10000000 == 0:
                        sys.stdout.write(' (%d M)\n' % (i // 1000000))
        if VERBOSE:
            sys.stdout.write('\n')

    def _load_pickle(self, path):
        """
        Loads data from a pickle file

        :param path: input filename
        :type path: string
        """
        # SECURITY: unpickling can execute arbitrary code. Only load files
        # that come from a trusted source.
        with open(path, 'rb') as handle:
            self._data = pickle.load(handle)

    def save(self, path, pickle=False):
        """
        Saves data in output file, one tab separated <value, row, col> per line

        :param path: output filename
        :type path: string
        :param pickle: save in pickle format?
        :type pickle: Boolean
        """
        if VERBOSE:
            sys.stdout.write('Saving data to %s\n' % path)
        if pickle:
            self._save_pickle(path)
            return
        with codecs.open(path, 'w', 'utf8') as out:
            for value, row_id, col_id in self._data:
                fields = [_to_text(value), _to_text(row_id), _to_text(col_id)]
                out.write('\t'.join(fields) + '\n')

    def _save_pickle(self, path):
        """
        Saves data in output file, using pickle format

        :param path: output filename
        :type path: string
        """
        # Binary mode: pickles are byte streams, and text mode corrupts them
        # on platforms that translate line endings.
        with open(path, 'wb') as handle:
            pickle.dump(self._data, handle)