# Partitioning FPGA-Optimized Systolic Arrays
We provide a fast optimization algorithm and a step-by-step guide on how to generate the dataset
for a specific board and set of topologies to be used by our optimization tool.
> *Long Chung Chan, Gurshaant Singh Malik and Nachiket Kapre*
> [**"Partitioning FPGA-Optimized Systolic Arrays for Fun and Profit"**](),
> 2019 International Conference on Field-Programmable Technology
# TODO
- [ ] Add a public link to the paper
- [ ] Basic demo flow
- [ ] Step-by-step guide on:
  - [ ] generating their own dataset
  - [ ] running the optimization algorithm on the generated dataset
- [ ] What each file/directory is responsible for
- [ ] Changes made to ScaleSim
- [ ] Explanation of the testing results
### Instructions for Demos:
#### Functional Demos:
##### Pre-Requisites:
Please see the
##### Instructions on running the functional demos.
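The full step-by-step instructions are still TODO. As a placeholder, here is a minimal sketch of invoking the brute-force optimizer shipped in this commit (the command line is taken from the usage comment in `brute_force_approach.py`, and it assumes the dataset files `googlenet.csv` and `googlenet_mem_bound.csv` are in the working directory):

```
# python3 brute_force_approach.py <topology> <num_partitions> <max_res_units> <target_column>
python3 brute_force_approach.py googlenet 20 960 DRAM_cycle
```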
##### Understanding the output of the demo
#### Running your workload with our NoC
### Referring to the data from the paper
To refer to the data quoted in the paper, please see the [results directory](https://git.uwaterloo.ca/watcag-public/bft-flow/tree/master/results), where
the data is grouped by performance and by hardware mapping.
### License
This tool is distributed under the MIT license.
Copyright (c) 2019 Gurshaant Singh Malik, Nachiket Kapre
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

**The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.**

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
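# ---- brute_force_approach.py (filename taken from the usage comment in __main__) ----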
import sys
import time
import csv
from tqdm import tqdm
from itertools import combinations
class brute_force(object):
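    """Exhaustive baseline: enumerate every k-way split of the network into
    contiguous layer groups, greedily assign resource units to the slowest
    group, and keep the split with the lowest bottleneck latency."""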
def __init__(self,
# data path
path_to_datasrc = "googlenet_mem_bound.csv",
path_to_topology = "googlenet.csv",
target_col = "DRAM_cycle",
# problem definition
number_of_partition = 4,
# constraint
max_res_available = 960, initial_res = 0,
res_step = 1,
):
self.topology_file = path_to_topology
self.k = number_of_partition
self.best_candidate = [0] * number_of_partition
self.max_res_unit = max_res_available
self.res_step = res_step
self.target_col = target_col
self.data_src = {}
self.layers = self.parse_topology_file()
self.parse_data_set_file(path_to_datasrc)
self.ending_iter = 0
self.start = time.time()
def parse_topology_file(self):
layers = []
with open(self.topology_file, 'r') as f:
next(f)
for line in f:
elems = line.strip().split(',')
layers.append(elems[0])
for layer in layers:
self.data_src[layer] = {}
return layers
def parse_data_set_file(self, path_to_data_csv):
first = True
target_idx = 2
with open(path_to_data_csv, 'r') as f:
for line in f:
elems = line.strip().split(',')
if first:
for idx, col in enumerate(elems):
if self.target_col in col:
target_idx = idx
break
first = False
else:
self.data_src[elems[1]][int(elems[0])] = int(float(elems[target_idx]))
def decode_gene(self, gene):
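        """Decode a gene (a tuple of cut indices) into contiguous groups of
        layer names, one group per partition."""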
gene = list(gene)
solution_layer_domain = []
part = []
idx = 0
for idx, l in enumerate(self.layers):
if not gene:
part = self.layers[idx:]
solution_layer_domain.append(part)
break
part.append(l)
if idx == gene[0]:
solution_layer_domain.append(part)
part = []
gene.pop(0)
return solution_layer_domain
def generate_gene(self):
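        """Enumerate all genes: every choice of k-1 cut positions among the
        len(layers)-1 boundaries between consecutive layers."""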
return list(combinations(list(range(len(self.layers) - 1)), self.k-1))
def find_max_latency(self, layer_partition, res_partitions):
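        """Accumulate each partition's per-layer latency at its resource count;
        return (latencies, index of the slowest partition)."""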
latencies = [0] * len(layer_partition)
max_latency_idx = 0
for idx, part in enumerate(layer_partition):
res = res_partitions[idx]
for layer in part:
latencies[idx] += self.data_src[layer][res]
if latencies[idx] > latencies[max_latency_idx]:
max_latency_idx = idx
return latencies, max_latency_idx
def evaluate_hybird(self, gene):
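        """Start every partition at res_step units, then repeatedly give one
        more res_step to the current bottleneck until the budget is spent."""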
layer = self.decode_gene(gene)
res = [self.res_step] * self.k
latencies = []
        for _ in range(int((self.max_res_unit - self.k * self.res_step) / self.res_step)):
latencies, max_idx = self.find_max_latency(layer, res)
res[max_idx] += self.res_step
return latencies[max_idx], latencies, res, layer, gene
def run(self):
        min_latency = float("inf")
for g in tqdm(self.generate_gene()):
lat, _, _, _, _ = self.evaluate_hybird(g)
if lat < min_latency:
self.best_candidate = g
min_latency = lat
self.end = time.time()
    def report(self):
        # latency of mapping the whole network onto the full array, for comparison
        full_latency, full_max_idx = self.find_max_latency([self.layers], [self.max_res_unit]*len(self.layers))
        max_latency, latencies, res, layer, _ = self.evaluate_hybird(self.best_candidate)
print("================================= RESULT =================================")
print("Layer assignment:")
print(layer)
print("Res mapping:")
print(res)
print("Latency for each partition: ")
print(latencies)
print("Final Latency:", max_latency*self.k, "|| Throught put:", 1/max_latency)
print("==========================================================================")
print("Map to full array (", self.max_res_unit, ")")
print("Final Latency:", full_latency[full_max_idx], "|| Throught put:", 1/full_latency[full_max_idx])
print("==========================================================================")
print("Throughtput Ratio:", (1/max_latency)/(1/full_latency[full_max_idx]))
print("Latency increase:", (max_latency*self.k)/full_latency[full_max_idx])
        with open('bruteforce.csv', 'a') as csvFile:
            writer = csv.writer(csvFile, delimiter=',', lineterminator="\n")
            writer.writerow([self.target_col, self.k, self.topology_file, 1,
                             (1/max_latency), max_latency*self.k,
                             1/full_latency[full_max_idx], full_latency[full_max_idx],
                             (1/max_latency)/(1/full_latency[full_max_idx]),
                             (max_latency*self.k)/full_latency[full_max_idx],
                             layer, res, self.end-self.start, self.max_res_unit])
if __name__ == "__main__":
print("Brute Force")
# python3 brute_force_approach.py googlenet 20 960 DRAM_cycle
topology = sys.argv[1]
k = int(sys.argv[2])
max_res_unit = int(sys.argv[3])
target_col = sys.argv[4]
bf = brute_force(
path_to_datasrc = str(topology)+"_mem_bound.csv",
path_to_topology = str(topology)+".csv",
target_col = str(target_col),
number_of_partition = k,
max_res_available = max_res_unit, initial_res = 0,
res_step = 1,
)
# bf.generate_gene()
bf.run()
bf.report()
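# ---- CMA-ES partitioner (a separate source file in this commit) ----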
import csv
import cma
import time
import path_constant as pc
from multiprocessing import Pool
from os import cpu_count
class cma_approach(object):
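    """CMA-ES search over k-way layer partitionings. Cut sizes are encoded as
    floats in [0, 1] for the optimiser; samples that decode to an invalid
    assignment are penalised instead of discarded."""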
def __init__(self,
# data path
path_to_datasrc = "alexnet_data.csv",
path_to_topology = "alexnet.csv",
target_col = "Cycles",
# problem definition
number_of_partition = 4, max_iteration = 100,
sigma = 0.5, population_size = 10,
# constraint
max_res_unit = 960, initial_res = 0,
res_step = 1,
penalty_offest = 10000000000,
seeding_type="optimised",
hybird = True
):
self.target_col = target_col
self.start = time.time()
self.k = number_of_partition
self.max_iter = max_iteration
self.sigma = sigma
self.max_res_unit = max_res_unit
self.res_step = res_step
self.population_size = population_size
self.penalty_offest = penalty_offest
self.ending_iter = 0
self.is_hybird = hybird
self.data_src = {}
self.topology_file = path_to_topology
self.layers = self.parse_topology_file()
self.parse_data_set_file(path_to_datasrc)
self.best_layer = number_of_partition * [0]
self.best_res = number_of_partition * [0]
self.total_valid_solution = 0
self.trial = 1
self.seeding_type = seeding_type
def parse_topology_file(self):
layers = []
with open(self.topology_file, 'r') as f:
next(f)
for line in f:
elems = line.strip().split(',')
layers.append(elems[0])
for layer in layers:
self.data_src[layer] = {}
return layers
def parse_data_set_file(self, path_to_data_csv):
first = True
target_idx = 2
with open(path_to_data_csv, 'r') as f:
for line in f:
elems = line.strip().split(',')
if first:
for idx, col in enumerate(elems):
if self.target_col in col:
target_idx = idx
break
first = False
else:
self.data_src[elems[1]][int(elems[0])] = int(float(elems[target_idx]))
def regroup_layers(self, sample):
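        """Expand a list of partition sizes into the actual layer names for
        each partition, in topology order."""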
# #print("DEBUG", sample)
detail_sample = []
idx = 0
for size in sample:
part = []
if size == 1:
part.append(self.layers[idx])
idx += 1
else:
for i in range(0, size):
part.append(self.layers[i + idx])
idx += size
detail_sample.append(part)
return detail_sample
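    # CMA-ES samples are floats in [0, 1]; decode/encode map between that
    # space and integer counts (layers or resource units).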
def decode(self, val, max_val):
return int(val * max_val)
def encode(self, val, max_val):
return float(val / max_val)
def filter_layer(self, layer):
for idx in range(self.k):
if layer[idx] <= 0:
return False
if sum(layer) != len(self.layers):
return False
return True
def filter_res(self, res):
for idx in range(self.k):
if res[idx] <= 0:
return False
if sum(res) != self.max_res_unit:
return False
return True
def penalty_layer(self, layer):
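        """Penalty for an invalid layer assignment: a large flat offset plus
        terms that grow with how far the assignment is from feasible
        (penalty_res applies the same scheme to resource splits)."""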
penalty_score = self.penalty_offest
if sum(layer) != len(self.layers):
penalty_score += self.penalty_offest
else:
layer = [abs(val) for val in layer]
for idx in range(self.k):
if layer[idx] <= 0:
penalty_score *= 1.05
percent_diff = (abs(sum(layer) - len(self.layers)) / len(self.layers))
penalty_score += percent_diff * self.penalty_offest
return penalty_score
def penalty_res(self, res):
penalty_score = self.penalty_offest
if sum(res) != self.max_res_unit:
penalty_score += self.penalty_offest
else:
res = [abs(val) for val in res]
for idx in range(self.k):
if res[idx] <= 0:
penalty_score *= 1.05
percent_diff = abs(sum(res) - self.max_res_unit) / self.max_res_unit
penalty_score += percent_diff * self.penalty_offest
return penalty_score
def find_max_latency(self, layer_partition, res_partitions):
latencies = [0] * len(layer_partition)
max_latency_idx = 0
for idx, part in enumerate(layer_partition):
res = res_partitions[idx]
for layer in part:
latencies[idx] += self.data_src[layer][res]
if latencies[idx] > latencies[max_latency_idx]:
max_latency_idx = idx
return latencies, max_latency_idx
def evaluate_hybird(self, layer):
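        """Greedy resource allocation for a fixed layer partitioning (same
        scheme as the brute-force variant); returns the bottleneck latency."""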
res = [self.res_step] * self.k
latencies = []
        for i in range(int((self.max_res_unit - self.k * self.res_step) / self.res_step)):
latencies, max_idx = self.find_max_latency(layer, res)
res[max_idx] += self.res_step
return latencies[max_idx], latencies, res, layer
def evaluate_full_relaxed(self, layer):
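        """Relaxed variant: for a fixed layer partitioning, run an inner
        CMA-ES over the resource split instead of the greedy allocation."""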
seed = []
for i in range(self.k - 1):
seed.append(int(self.max_res_unit/self.k))
seed.append(self.max_res_unit - sum(seed))
seed = [self.encode(val, self.max_res_unit) for val in seed[:-1]]
es_res = cma.CMAEvolutionStrategy(seed, \
self.sigma, {'popsize' : self.population_size})
i = 0
while not es_res.stop() and i < self.max_iter:
samples = es_res.ask()
scores = [0] * es_res.popsize
res = [0] * es_res.popsize
for idx, sample in enumerate(samples):
res_assign = [self.decode(val, self.max_res_unit) for val in sample]
res_assign.append(self.max_res_unit - sum(res_assign))
res[idx] = res_assign
for idx, r in enumerate(res):
if self.filter_res(r):
latencies, max_idx = self.find_max_latency(layer, r)
scores[idx] = latencies[max_idx]
else:
scores[idx] = self.penalty_res(r)
es_res.tell(samples, scores)
i += 1
res = [self.decode(val, self.max_res_unit) for val in es_res.result[0]]
res.append(self.max_res_unit - sum(res))
        if self.filter_res(res):
            latencies, max_idx = self.find_max_latency(layer, res)
        else:
            max_latency = self.penalty_res(res)
            latencies = [max_latency] * self.k
            max_idx = 0
return latencies[max_idx], latencies, res, layer
def evaluation_top_level(self, in_val):
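        """Pool worker: decode one CMA-ES sample into a layer assignment and
        score it (penalised if invalid); the relaxed mode also returns the
        chosen resource split."""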
pid, sampling = in_val
layer = [self.decode(val, len(self.layers)) for val in sampling]
layer.append(len(self.layers) - sum(layer))
penalty = 0
if not self.filter_layer(layer):
penalty = self.penalty_layer(layer)
if self.is_hybird:
return pid, penalty
else:
return pid, penalty*4
layer = self.regroup_layers(layer)
if self.is_hybird:
return pid, self.evaluate_hybird(layer)[0]
else:
score, _, res, _ = self.evaluate_full_relaxed(layer)
return pid, score, res
def run(self):
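        """Outer CMA-ES loop: seed the cut sizes, then iteratively sample,
        score in parallel (with penalties for invalid samples), and keep the
        best layer assignment seen."""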
        self.trial += 1
if (self.seeding_type=="allzeros"):
self.seed = [0]*(self.k-1)
self.seed_od = self.seed
elif (self.seeding_type=="optimised"):
self.seed = []
for i in range(self.k - 1):
self.seed.append(int(len(self.layers)/self.k))
self.seed.append(len(self.layers) - sum(self.seed))
self.seed_od = self.seed
self.seed = [self.encode(val, len(self.layers)) for val in self.seed[:-1]]
else:
raise ValueError('Invalid Seeding Strategy')
self.es = cma.CMAEvolutionStrategy(self.seed, self.sigma, \
{'popsize' : self.population_size})
best_overall = self.penalty_offest
self.i = 0
while not self.es.stop() and self.i < self.max_iter:
samples = self.es.ask()
id_list = [(idx, sample) for idx, sample in enumerate(samples)]
scores = [0] * self.es.popsize
invalid_sampling = 0
            res_combinations = [0] * self.es.popsize
            # leave some cores free for the parent, but always use at least one worker
            pool = Pool(processes = max(1, cpu_count() - 4))
for result in pool.imap_unordered(self.evaluation_top_level, id_list):
scores[result[0]] = result[1]
if result[1] >= self.penalty_offest:
invalid_sampling += 1
else:
if not self.is_hybird:
                        res_combinations[result[0]] = result[2]
pool.close()
pool.join()
if not self.is_hybird:
best_in_iteration = min(scores)
if best_in_iteration < best_overall and best_in_iteration < self.penalty_offest:
best_overall = best_in_iteration
                    self.best_res = res_combinations[scores.index(min(scores))]
self.valid_sampling_percentage = (self.population_size - invalid_sampling) /self.population_size *100
self.total_valid_solution += self.population_size - invalid_sampling
self.es.tell(samples, scores)
self.end = time.time()
self.best_layer = [self.decode(val, len(self.layers)) for val in self.es.result[0]]
self.best_layer.append(len(self.layers) - sum(self.best_layer))
self.report()
self.i += 1
self.ending_iter = self.i
def report(self):
max_latency = 0
layer = []
res = []
latencies = []
if self.is_hybird:
            if not self.filter_layer(self.best_layer):
                # best sample is invalid: record a zeroed row for this trial
with open(pc.RESULT_CSV_PATH+'cma.csv', 'a') as csvFile:
writer = csv.writer(csvFile, delimiter=',', lineterminator="\n")
writer.writerow([self.target_col,self.i,self.k, self.topology_file, 0,0, 0, 0, 0, 0, 0, layer, res, self.end-self.start, self.es.</