# Simulate iQIYI (爱奇艺) website access-log generation in Python

import random
import time

# --- Filter-value tables used to build randomized request URLs ---

# TV-series category paths
url_paths = [
    "www/2",
    "www/1",
    "www/6",
    "www/4",
]

# Price filter: free ("0") or paid ("2")
url_price = [
    "0",
    "2",
]

# Region codes: mainland China, Hong Kong, South Korea, US, Japan,
# Thailand, Taiwan, UK, other
url_area = [
    "15",
    "16",
    "17",
    "18",
    "309",
    "1114",
    "1117",
    "28916",
    "19",
]

# Genre codes: original, costume, romance, wuxia, idol, family, youth,
# urban, comedy, war, military, spy, suspense, crime, time-travel, palace,
# history, mythology, sci-fi, period, rural, business, drama, fantasy,
# web series (empty string = no genre filter)
url_style = [
    "",
    "11992",
    "24",
    "20",
    "23",
    "30",
    "1654",
    "1653",
    "24064",
    "135",
    "27916",
    "1655",
    "290",
    "32",
    "149",
    "148",
    "139",
    "21",
    "145",
    "34",
    "27",
    "29",
    "140",
    "24063",
    "27881",
    "24065",
]

# Release-period filter (empty string = any era)
url_time = [
    "",
    "2018",
    "2017",
    "2016",
    "2011_2015",
    "2000_2010",
    "1990_1999",
    "1980_1989",
    "1964_1979",
]

# Sort order: composite ranking, popularity, update time
url_sort = [
    "24",
    "11",
    "4",
]

# Site filter (empty string = all sites)
url_site = [
    "",
    "iqiyi",
]

# Result-page numbers "1" .. "15"
url_page = [str(n) for n in range(1, 16)]

url_other = [
    "1",
]

# HTTP status codes to sample from
status_code = [404, 302, 200]

# Valid IPv4 octet values 1..254 (0 and 255 excluded)
ip_slices = list(range(1, 255))

#ip地址
def sample_ip():
    """Return a random dotted-quad IPv4 string with four distinct octets.

    Uses random.sample (not choice) so the four octets are distinct,
    matching the original behavior.
    """
    octets = random.sample(ip_slices, 4)
    return ".".join(str(o) for o in octets)

def sample_url():
    """Return one random category path from url_paths."""
    return random.choice(url_paths)

def sample_area():
    """Return one random region code from url_area."""
    return random.choice(url_area)

def sample_style():
    """Return one random genre code from url_style."""
    return random.choice(url_style)

def sample_price():
    """Return one random price-filter value from url_price."""
    return random.choice(url_price)

def sample_time():
    """Return one random release-period value from url_time."""
    return random.choice(url_time)

def sample_sort():
    """Return one random sort-order code from url_sort."""
    return random.choice(url_sort)

def sample_site():
    """Return one random site-filter value from url_site."""
    return random.choice(url_site)

def sample_page():
    """Return one random page number from url_page."""
    return random.choice(url_page)

def sample_other():
    """Return one random value from url_other."""
    return random.choice(url_other)

def sample_status():
    """Return one random HTTP status code from status_code."""
    return random.choice(status_code)

def generate_log(count=10):
    """Write *count* fake access-log lines to the output file.

    Each line has the shape:
        <ip>\t<timestamp>\t"GET <path>/<area>-<style>-...-<site>--.html"\t<status>
    All lines in one call share a single timestamp (taken once, up front).

    NOTE(review): the original file opened "D:\\data\\data" without a raw
    string, so "\\d" was passed through literally only by luck; a raw
    string makes the intent explicit.
    """
    time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    # 'with' guarantees the file is closed even if a write fails
    # (the original never closed the handle).
    with open(r"D:\data\data", "w+") as f:
        # f = open("/home/ubuntu/data/log.csv", "a+")
        for _ in range(count):
            query_log = (
                '{ip}\t{localtime}\t"GET {url}/{area}-{style}----------'
                '{price}-{time}--{sort}-{page}-{other}-{site}--.html"'
                '\t{status}'
            ).format(
                ip=sample_ip(),
                localtime=time_str,
                url=sample_url(),
                area=sample_area(),
                style=sample_style(),
                price=sample_price(),
                time=sample_time(),
                sort=sample_sort(),
                page=sample_page(),
                other=sample_other(),
                site=sample_site(),
                status=sample_status(),
            )
            f.write(query_log + "\n")

# Script entry point: the paste mangled the dunders to "name"/"main".
if __name__ == '__main__':
    generate_log(1000000)

# (blog footer) Related tags: python, bigdata