One of the challenging parts of the paper was creating the traces to train a better model. This blog contains the scripts used to generate these traces. We’ve also included the raw data in our git repository so that you don’t have to run these scripts — they rely on downloading large amounts of data.
# FCC "Measuring Broadband America" raw measurements, February 2019
$ wget http://data.fcc.gov/download/measuring-broadband-america/2019/data-raw-2019-feb.tar.gz
$ tar -zxvf data-raw-2019-feb.tar.gz -C FCC
# Norway 3G/HSDPA TCP bandwidth logs (University of Oslo)
$ wget -r --no-parent --reject "index.html*" http://home.ifi.uio.no/paalh/dataset/hsdpa-tcp-logs/
# Belgium 4G/LTE bandwidth logs (Ghent University)
$ wget http://users.ugent.be/~jvdrhoof/dataset-4g/logs/logs_all.zip
$ unzip logs_all.zip -d belgium
After downloading the three datasets, we observe the following characteristics:
Now you can convert these traces into datasets for training using the code below. There will be more than 1000 traces, so you can even shuffle the lines from the datasets and pick a smaller subset.
import os
import shutil
# Directory holding the raw Norway HSDPA TCP logs (one sub-folder per route).
src_PATH = "../../datasets/original_network/3G_HSDPA/hsdpa-tcp-logs/"
# Flat destination directory into which all renamed trace files are copied.
dst_PATH = "../../datasets/original_network/3G_HSDPA/hsdpa/"
def copy_file(srcfile, dstfile):
    """Copy `srcfile` to `dstfile`, creating the destination directory if needed.

    Prints a warning and does nothing when `srcfile` does not exist.
    """
    if not os.path.isfile(srcfile):
        print("%s not exist!" % (srcfile))
    else:
        fpath, fname = os.path.split(dstfile)
        # exist_ok avoids the check-then-create race of the original
        # `if not os.path.exists: makedirs`; guard against an empty dirname
        # when dstfile is a bare filename.
        if fpath:
            os.makedirs(fpath, exist_ok=True)
        shutil.copyfile(srcfile, dstfile)
        # message fixed: shutil.copyfile copies, it does not move
        print("copy %s -> %s" % (srcfile, dstfile))
def main():
    """Prefix each trace file with its folder name, then flatten into dst_PATH.

    Pass 1 renames every file under src_PATH/<folder>/ to <folder>.<file> so
    that names stay unique once flattened; pass 2 copies the renamed files
    into the single dst_PATH directory.
    """
    folders = os.listdir(src_PATH)
    # pass 1: rename in place
    for folder in folders:
        folder_dir = src_PATH + folder + '/'
        for fname in os.listdir(folder_dir):
            os.rename(folder_dir + fname, folder_dir + folder + "." + fname)
    # pass 2: copy the renamed files into the flat destination
    for folder in folders:
        folder_dir = src_PATH + folder + '/'
        for fname in os.listdir(folder_dir):
            copy_file(folder_dir + fname, dst_PATH + fname)

if __name__ == '__main__':
    main()
import numpy as np
import matplotlib.pyplot as plt
# Unit-conversion constants.
BITS_IN_BYTE = 8.0
MBITS_IN_BITS = 1000000.0
MILLISECONDS_IN_SECONDS = 1000.0
# Example HSDPA log (bus route, Ljansbakken-Oslo) to visualize.
LINK_FILE = "../../datasets/original_network/3G_HSDPA/hsdpa/bus.ljansbakken-oslo.report.2010-09-28_1407CEST.log"
def plot_log_bandwidth():
    """Plot instantaneous throughput (Mbit/s) over time for one HSDPA log.

    Reads LINK_FILE, where column 1 is a millisecond timestamp and the last
    two columns are bytes received and receive duration (ms).
    """
    records = []
    with open(LINK_FILE, 'r') as f:
        for line in f:
            fields = line.split()
            records.append((float(fields[1]), float(fields[-2]), float(fields[-1])))
    time_ms = np.array([r[0] for r in records])
    bytes_recv = np.array([r[1] for r in records])
    recv_time = np.array([r[2] for r in records])
    # bytes per millisecond -> megabits per second
    throughput_all = (bytes_recv / recv_time) * BITS_IN_BYTE / MBITS_IN_BITS * MILLISECONDS_IN_SECONDS
    # shift the time axis to start at zero and convert to seconds
    elapsed_s = (time_ms - time_ms[0]) / MILLISECONDS_IN_SECONDS
    plt.plot(elapsed_s, throughput_all)
    plt.xlabel('Time (second)')
    plt.ylabel('Throughput (Mbit/sec)')
    plt.show()

plot_log_bandwidth()
import os
import numpy as np
# Flattened HSDPA logs produced by the copy step above.
DATA_PATH = "../../datasets/original_network/3G_HSDPA/hsdpa/"
# Where the "<time_seconds> <throughput_Mbit/s>" trace files are written.
OUTPUT_PATH = "../../datasets/original_network/3G_HSDPA/throughput/"
MILLISEC_IN_SEC = 1000.0
MBITS_IN_BITS = 1000000.0
BITS_IN_BYTE = 8.0
def calculate_network_tp():
    """Convert each raw HSDPA log into a "<time_s> <throughput_Mbit/s>" file.

    Reading a file stops at the first timestamp that goes backwards, since a
    non-monotonic timestamp indicates a corrupted trace.
    """
    os.makedirs(OUTPUT_PATH, exist_ok=True)
    for fname in os.listdir(DATA_PATH):
        file_path = DATA_PATH + fname
        output_path = OUTPUT_PATH + fname
        print(file_path)
        # the original reused the loop variable `f` as the file handle;
        # distinct names avoid that shadowing
        with open(file_path, 'r') as inf, open(output_path, 'w') as outf:
            time = []
            for line in inf:
                parse = line.split()
                ts = float(parse[1])
                # trace error: time must be monotonically increasing
                if time and ts < time[-1]:
                    break
                time.append(ts)
                # seconds since the first sample
                time_s = (ts - time[0]) / MILLISEC_IN_SEC
                # bytes per millisecond -> megabits per second
                throughput = float(parse[-2]) / float(parse[-1]) * BITS_IN_BYTE / MBITS_IN_BITS * MILLISEC_IN_SEC
                outf.write(str(time_s) + " " + str(throughput) + '\n')

calculate_network_tp()
For actual video streaming over Mahimahi (http://mahimahi.mit.edu), the traces should be converted to the following format:
import os
import numpy as np
# Flattened HSDPA logs produced by the copy step above.
DATA_PATH = "../../datasets/original_network/3G_HSDPA/hsdpa/"
# Where the Mahimahi packet-arrival traces are written.
OUTPUT_PATH = "../../datasets/original_network/3G_HSDPA/mahimahi/"
BYTES_PER_PKT = 1500.0       # Mahimahi models fixed 1500-byte (MTU) packets
MILLISEC_IN_SEC = 1000.0
BITS_IN_BYTE = 8.0
def main():
    """Convert HSDPA logs into Mahimahi packet-delivery traces.

    A Mahimahi trace lists, one line per packet, the millisecond timestamp at
    which each 1500-byte packet is delivered.  For each measured interval we
    emit floor(elapsed_ms * pkt_per_ms) - already_sent packets per millisecond
    until the interval's receive duration has been covered.
    """
    os.makedirs(OUTPUT_PATH, exist_ok=True)
    for fname in os.listdir(DATA_PATH):
        file_path = DATA_PATH + fname
        output_path = OUTPUT_PATH + fname
        print(file_path)
        # distinct handle names: the original shadowed the loop variable `f`
        with open(file_path, 'r') as inf, open(output_path, 'w') as outf:
            time_ms = []
            bytes_recv = []
            recv_time = []
            for line in inf:
                parse = line.split()
                # trace error: time not monotonically increasing
                if len(time_ms) > 0 and float(parse[1]) < time_ms[-1]:
                    break
                time_ms.append(float(parse[1]))
                bytes_recv.append(float(parse[-2]))
                recv_time.append(float(parse[-1]))
            recv_time = np.array(recv_time)
            # bytes per millisecond for each measured interval
            throughput_all = np.array(bytes_recv) / recv_time
            millisec_time = 0
            outf.write(str(millisec_time) + '\n')
            for i in range(len(throughput_all)):
                pkt_per_millisec = throughput_all[i] / BYTES_PER_PKT
                millisec_count = 0
                pkt_count = 0
                while True:
                    millisec_count += 1
                    millisec_time += 1
                    to_send = np.floor((millisec_count * pkt_per_millisec) - pkt_count)
                    # BUG FIX: the original inner loop reused `i`, clobbering
                    # the outer index so `recv_time[i]` below read the wrong
                    # element.  (The per-millisecond debug print was dropped.)
                    for _ in range(int(to_send)):
                        outf.write(str(millisec_time) + '\n')
                    pkt_count += to_send
                    if millisec_count >= recv_time[i]:
                        break

if __name__ == '__main__':
    main()
import numpy as np
import matplotlib.pyplot as plt
PACKET_SIZE = 1500.0 # bytes
BITS_IN_BYTE = 8.0
MBITS_IN_BITS = 1000000.0
MILLISECONDS_IN_SECONDS = 1000.0
N = 100   # moving-average window (in samples) used to smooth the plot
# Mahimahi trace generated above for the same Oslo bus route.
LINK_FILE ="../../datasets/original_network/3G_HSDPA/mahimahi/bus.ljansbakken-oslo.report.2010-09-28_1407CEST.log"
def plot_mahimahi_bandwidth():
    """Reconstruct and plot throughput (Mbit/s) from a Mahimahi packet trace.

    Each trace line is one packet's millisecond timestamp; runs of identical
    timestamps are grouped into packet counts, converted to throughput, then
    smoothed with an N-point moving average before plotting.
    """
    timestamps = []
    counts = []
    prev_ts = 0
    run_len = 0
    with open(LINK_FILE, 'rb') as trace:
        for raw in trace:
            ts = int(raw.split()[0])
            if ts == prev_ts:
                run_len += 1
                continue
            # timestamp changed: close out the previous run
            timestamps.append(prev_ts)
            counts.append(run_len)
            run_len = 1
            prev_ts = ts
    # time gap (ms) between consecutive distinct timestamps
    gaps_ms = np.array(timestamps[1:]) - np.array(timestamps[:-1])
    # packets * bytes/pkt * bits/byte / ms, scaled to Mbit per second
    throughput = (PACKET_SIZE * BITS_IN_BYTE * np.array(counts[1:]) / gaps_ms
                  * MILLISECONDS_IN_SECONDS / MBITS_IN_BITS)
    print(throughput)
    smoothed = np.convolve(throughput, np.ones(N,) / N, mode='same')
    plt.plot(np.array(timestamps[1:]) / MILLISECONDS_IN_SECONDS, smoothed)
    plt.xlabel('Time (second)')
    plt.ylabel('Throughput (Mbit/sec)')
    plt.show()

plot_mahimahi_bandwidth()
In order to verify the generalization of the algorithm, we need to generate some extreme data traces:
# -*- coding: UTF-8 -*-
import os, sys, numpy, random, hashlib
from sympy import Symbol, solve
# --- Markov-chain parameters for synthetic network traces ---
T_l = 1      # expected state-holding time; feeds prob_stay below
T_s = 0.5    # how long a noise sample is held (in trace steps)
cov = 0.5    # coefficient of variation: per-state variance = cov * bitrate
# of seconds for each tracefile
time_length = 1440
filename = 0
# md5 of every generated trace, used later for duplicate detection
md5s = []
# get bitrate levels (in Mbps)
min_bitrate = 0.2
max_bitrate = 1.2
steps = 10
# evenly spaced bitrate states from min_bitrate to max_bitrate
bitrate_states_low_var = []
curr = min_bitrate
for x in range(0, steps):
    bitrate_states_low_var.append(curr)
    curr += ((max_bitrate - min_bitrate) / (steps - 1))
    x += 1  # no-op: the for statement already advances x
# list of transition probabilities
transition_probabilities = []
# assume you can go steps-1 states away (we will normalize this to the actual scenario)
eq = -1
x = Symbol("x", positive=True)
for y in range(1, steps - 1):
    eq += (1 / x**y)
res = solve(eq)
switch_parameter = res[0]
# push steps-2 identical entries into the table
# NOTE(review): `y` here is the value left over from the loop above, so every
# entry equals y/switch_parameter (a uniform table, matching the original
# comment "put in 8 identical values") -- but 1/switch_parameter**z may have
# been intended, which would weight shorter jumps more; confirm with authors.
for z in range(1, steps - 1):
    transition_probabilities.append(1 / switch_parameter * y)
# two variance levels
sigma_low = 1.0
sigma_high = 1.0
# probability of switching variance levels
variance_switch_prob = 0.2
# probability to stay in same state!!!!!
# NOTE: with T_l == 1 this evaluates to 0, so the chain transitions every step.
prob_stay = 1 - 1 / T_l
# takes a state and decides what the next state is
def transition(state, variance):
    """Return (next_state, next_variance) for one Markov step.

    `state` is an index into bitrate_states_low_var; `variance` is compared
    against sigma_low/sigma_high to flip the variance level.  Reads the
    module-level globals prob_stay, variance_switch_prob, sigma_low,
    sigma_high, transition_probabilities and bitrate_states_low_var.
    Calls random.uniform up to three times, so RNG state advances even on
    a "stay" decision.
    """
    transition_prob = random.uniform(0, 1)
    # pick next variance first
    variance_switch = random.uniform(0, 1)
    next_variance = variance
    if (variance_switch < variance_switch_prob):
        if (next_variance == sigma_low):
            next_variance = sigma_high
        else:
            next_variance = sigma_low
    if transition_prob < prob_stay:  # stay in current state
        return (state, next_variance)
    else:  # pick appropriate state!
        next_state = state
        curr_pos = state
        # first find max distance that you can be from current state
        max_distance = max(curr_pos, len(bitrate_states_low_var) - 1 - curr_pos)
        # cut the transition probabilities to only have possible number of steps
        curr_transition_probabilities = transition_probabilities[0:max_distance]
        trans_sum = sum(curr_transition_probabilities)
        normalized_transitions = [x / trans_sum for x in curr_transition_probabilities]
        # generate a random number and see which bin it falls in to
        trans_switch_val = random.uniform(0, 1)
        running_sum = 0
        num_switches = -1
        for ind in range(0, len(normalized_transitions)):
            if (trans_switch_val <= (normalized_transitions[ind] + running_sum)):  # this is the val
                num_switches = ind
                break
            else:
                running_sum += normalized_transitions[ind]
        # NOTE(review): if floating-point error leaves trans_switch_val above the
        # cumulative sum, num_switches stays -1 and the up/down moves below are
        # inverted by one step -- confirm whether this edge case matters.
        # now check if there are multiple ways to move this many states away
        switch_up = curr_pos + num_switches
        switch_down = curr_pos - num_switches
        if (switch_down >= 0 and switch_up <= (len(bitrate_states_low_var) - 1)):  # can go either way
            x = random.uniform(0, 1)
            if (x < 0.5):
                return (switch_up, next_variance)
            else:
                return(switch_down, next_variance)
        elif switch_down >= 0:  # switch down
            return(switch_down, next_variance)
        else:  # switch up
            return(switch_up, next_variance)
# randomly pick an initial state for each trace; the noise amplitude follows the state
for g in range(10):
    f = open("./network_trace/" + str(filename), "w")
    current_state = random.randint(0, len(bitrate_states_low_var) - 1)
    current_variance = cov * bitrate_states_low_var[current_state]
    time = 0
    cnt = 0
    while time < time_length:
        # prints timestamp (in seconds) and throughput (in Mbits/s)
        if cnt <= 0:
            # resample the Gaussian noise; cnt = T_s (= 0.5) then cnt -= 1
            # below, so in practice noise is redrawn every iteration
            noise = numpy.random.normal(0.5, current_variance, 1)[0]
            # print("bitrate_states_low_var:" + str(bitrate_states_low_var[current_state]) + " noise:" + str(noise))
            cnt = T_s
        # clamp the sampled throughput to a minimum of 0.2 Mbit/s
        gaus_val = max(0.2, bitrate_states_low_var[current_state] + noise)
        cnt -= 1
        # print(str(time) + " " + str(gaus_val))
        f.write(str(time) + " " + str(gaus_val) + "\n")
        next_vals = transition(current_state, current_variance)
        if current_state != next_vals[0]:
            # state changed: force a fresh noise sample next iteration
            cnt = 0
        current_state = next_vals[0]
        current_variance = cov * bitrate_states_low_var[current_state]
        time += 0.5
    f.close()
    # md5 the finished trace so duplicates can be detected afterwards
    md5f = open("./network_trace/" + str(filename), "rb")
    md5 = hashlib.md5(md5f.read()).hexdigest()
    md5s.append(md5)
    print(md5)
    md5f.close()
    filename += 1
# Report duplicate traces: an identical md5 means an identical generated file.
# BUG FIX: the original referenced the undefined name `list_no` (NameError),
# used Python-2 style `print(...), i` which never printed the value, and
# printed the "no duplicates" message even after finding a duplicate.
duplicate_found = False
for digest in md5s:
    if md5s.count(digest) != 1:
        print("duplicate md5: %s (count %d)" % (digest, md5s.count(digest)))
        duplicate_found = True
        break
if not duplicate_found:
    print("no duplicates")
If you have any questions, feel free to leave a comment.