Skip to content

Commit

Permalink
[BUGFIX] Mapping mechanism
Browse files Browse the repository at this point in the history
In this commit, two kinds of mapping problems are fixed:
(1) OP-KERNEL mapping. The mapping mechanism has been rewritten
here, because the previous one caused some errors.
(2) Tracer-Counter in TensorFlow. We found that when the tracer and
the counter run on a computational graph in the same process, they
interact with each other, and the kernels are not exactly the same
when running some ops. We therefore define the metrics xxInfoTracer
and xxInfoCounter so that the two can be measured separately.

Signed-off-by: YushuoEdge <[email protected]>
  • Loading branch information
YushuoEdge committed Apr 28, 2022
1 parent 547cf32 commit 138124c
Show file tree
Hide file tree
Showing 17 changed files with 124 additions and 77 deletions.
82 changes: 40 additions & 42 deletions src/amanda/profiler/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Polygon
from torch import take
from utils import findTopK

def drawRoofline(hardwareTFlops, hardwareIntensity, X, Y, op=False):
Expand Down Expand Up @@ -67,28 +68,32 @@ def kernelInfoTracer(opList, timeList, apiList, rtList):
if x.kind == "KERNEL" or x.kind == "CONC KERNEL":
kernelList.append(x)

infoTime = []
for i in range(len(opList)):
infoTime.append(["OP", opList[i], timeList[i]])
for i in range(len(launchKernelApiList)):
infoTime.append(["KERNEL", launchKernelApiList[i].name, launchKernelApiList[i].startTime])

def takeTime(elem):
return elem[2]

infoTime.sort(key=takeTime)

# Tracer match kernel and op
infoList = []
opIndex = 0
opIndex = -1
kernelIndex = 0
timeListLen = len(timeList)
for i in range(len(launchKernelApiList)):
record = []
if (opIndex < timeListLen - 1 and launchKernelApiList[i].startTime > timeList[opIndex + 1]):
kernelNum = 0
for i in range(len(infoTime)):
if infoTime[i][0] == "OP":
opIndex += 1
kernelIndex = 0

record.append(opIndex)
record.append(opList[opIndex])

record.append(kernelIndex)
record.append(kernelList[i].kind)
record.append(kernelList[i].name)
record.append(launchKernelApiList[i].durationTime)
record.append(kernelList[i].durationTime)

infoList.append(record)
kernelIndex += 1
else:
infoList.append([opIndex, opList[opIndex], kernelIndex, kernelList[kernelNum].kind,
kernelList[kernelNum].name, launchKernelApiList[kernelNum].durationTime,
kernelList[kernelNum].durationTime])
kernelIndex += 1
kernelNum += 1

# Find Top-K kernel according to kernel execution time
k = min(20, len(kernelList))
Expand Down Expand Up @@ -206,33 +211,26 @@ def opInfoTracer(opList, startTimeList, endTimeList, apiList, rtList):
kernelList.append(x)

# Calculate number of kernels for each op
kernelNumList = []
kernelCount = 0
opIndex = 0
kernelIndex = 0
timeListLen = len(startTimeList)

while opIndex < timeListLen - 1 and startTimeList[opIndex] < launchKernelApiList[0].startTime and startTimeList[opIndex+1] < launchKernelApiList[0].startTime:
kernelNumList.append(0)
opIndex += 1
infoTime = []
for i in range(len(opList)):
infoTime.append(["OP", opList[i], startTimeList[i]])
for i in range(len(launchKernelApiList)):
infoTime.append(["KERNEL", launchKernelApiList[i].name, launchKernelApiList[i].startTime])

while kernelIndex < len(launchKernelApiList):
if (opIndex < timeListLen - 1 and launchKernelApiList[kernelIndex].startTime > startTimeList[opIndex + 1]):
kernelNumList.append(kernelCount)
opIndex += 1
kernelCount = 0
continue
kernelCount += 1
kernelIndex += 1
kernelNumList.append(kernelCount)
def takeTime(elem):
return elem[2]
infoTime.sort(key=takeTime)

while opIndex < timeListLen - 1:
kernelNumList.append(0)
opIndex += 1

for i in range(len(opList)):
print(opList[i])
print(kernelNumList[i])
kernelNumList = []
kernelCountOp = 0
for i in range(len(infoTime)):
if infoTime[i][0] == "OP":
kernelNumList.append(kernelCountOp)
kernelCountOp = 0
else:
kernelCountOp += 1
kernelNumList.append(kernelCountOp)
kernelNumList = kernelNumList[1:]

# Get information for each op
kernelCount = 0
Expand Down
2 changes: 1 addition & 1 deletion src/amanda/profiler/pytorch/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from amanda_tracer import *
from amanda_counter import *
from profiler import *
from amanda_profiler import *
3 changes: 0 additions & 3 deletions src/amanda/profiler/pytorch/amanda_counter.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,6 @@ def forward_instrumentation(self, context: amanda.OpContext):
op = context.get_op()
self.opCount += 1

# if self.opCount > 10:
# return

self.opList.append(op.__name__)
context.insert_before_op(
self.start_profiling,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from metrics import kernelRoofline, opRoofline
from torchMetrics import kernelInfo, opInfo

class Profiler():
class amandaProfiler():
def __init__(self, metric) -> None:
self.__metric = metric

Expand Down
3 changes: 0 additions & 3 deletions src/amanda/profiler/pytorch/amanda_tracer.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,6 @@ def forward_instrumentation(self, context: amanda.OpContext):
op = context.get_op()
self.opCount += 1

# if self.opCount > 10:
# return

self.opList.append(op.__name__)
context.insert_before_op(
self.init_trace,
Expand Down
4 changes: 2 additions & 2 deletions src/amanda/profiler/pytorch/test/kernelInfo_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import torchvision

sys.path.append("..")
from profiler import Profiler
from amanda_profiler import amandaProfiler

def main():

Expand All @@ -15,7 +15,7 @@ def main():
x = torch.rand((32, 3, 227, 227)).to(device)

metric = "KernelInfo"
profiler = Profiler(metric)
profiler = amandaProfiler(metric)
profiler.setConfigs(metric=metric, supplyInfo=[])

with amanda.tool.apply(profiler.counter):
Expand Down
4 changes: 2 additions & 2 deletions src/amanda/profiler/pytorch/test/kernelRoofline_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import torchvision

sys.path.append("..")
from profiler import Profiler
from amanda_profiler import amandaProfiler

def main():

Expand All @@ -16,7 +16,7 @@ def main():
metric = "KernelRoofline"
# Nvidia Geforce RTX 2080 Ti: 1350MHz, 13.45 Single-Precision TFlops, 616GB/s
supplyInfo = [1350, 13.45, 616]
profiler = Profiler(metric)
profiler = amandaProfiler(metric)
profiler.setConfigs(metric=metric, supplyInfo=supplyInfo)

with amanda.tool.apply(profiler.counter):
Expand Down
4 changes: 2 additions & 2 deletions src/amanda/profiler/pytorch/test/opInfo_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import torchvision

sys.path.append("..")
from profiler import Profiler
from amanda_profiler import amandaProfiler

def main():

Expand All @@ -15,7 +15,7 @@ def main():
x = torch.rand((32, 3, 227, 227)).to(device)

metric = "OpInfo"
profiler = Profiler(metric)
profiler = amandaProfiler(metric)
profiler.setConfigs(metric=metric, supplyInfo=[])

with amanda.tool.apply(profiler.counter):
Expand Down
4 changes: 2 additions & 2 deletions src/amanda/profiler/pytorch/test/opRoofline_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import torchvision

sys.path.append("..")
from profiler import Profiler
from amanda_profiler import amandaProfiler

def main():

Expand All @@ -17,7 +17,7 @@ def main():
metric = "OpRoofline"
# Nvidia Geforce RTX 2080 Ti: 1350MHz, 13.45 Single-Precision TFlops, 616GB/s
supplyInfo = [1350, 13.45, 616]
profiler = Profiler(metric)
profiler = amandaProfiler(metric)
profiler.setConfigs(metric=metric, supplyInfo=supplyInfo)

with amanda.tool.apply(profiler.counter):
Expand Down
2 changes: 1 addition & 1 deletion src/amanda/profiler/pytorch/torchMetrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def kernelInfo(opList, timeList, apiList, rtList, dataList):
res.to_csv("./Experiments/kernelInfo_result.csv", index=False, sep=',')


# Now Information: OpIndex, OpName, TotalExecutionTime, MaxKernelIndex, MaxKernelName, MaxKernelExecutionTime, kernelNumTracer, kernelNumCounter, TotalDramRead, TotalDramWrite, TotalSFOp, TotalElapsedCycles
# Now Information: OpIndex, OpName, TotalExecutionTime, MaxKernelIndex, MaxKernelName, MaxKernelExecutionTime, kernelNumTracer, kernelNumCounter, TotalElapsedCycles, TotalDramRead, TotalDramWrite, TotalSFOp
def opInfo(opList, startTimeList, endTimeList, apiList, rtList, dataList):

# Information collected by tracer: OpIndex, OpName, TotalExecutionTime, MaxKernelIndex, MaxKernelName, MaxKernelExecutionTime, kernelNumTracer
Expand Down
2 changes: 1 addition & 1 deletion src/amanda/profiler/tensorflow/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from amanda_tracer import *
from amanda_counter import *
from profiler import *
from amanda_profiler import *
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@

from utils import setConfigsMetric
from tfMetrics import kernelInfo, opInfo
from metrics import opInfoTracer, opInfoCounter, kernelInfoTracer, kernelInfoCounter

class Profiler():
class amandaProfiler():
def __init__(self, metric) -> None:
self.__metric = metric

Expand Down Expand Up @@ -45,7 +46,7 @@ def setConfigs(self, metric, supplyInfo, onlineOnly=False, offlineOnly=False):
def createSessionTracer(self):
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8)

config=tf.ConfigProto(gpu_options=gpu_options, intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
config=tf.ConfigProto(gpu_options=gpu_options, intra_op_parallelism_threads=0, inter_op_parallelism_threads=1)
session = tf.Session(config=config)
return session

Expand All @@ -57,25 +58,41 @@ def createSessionCounter(self):
return session

def showResults(self):
if self.__metric == "KernelInfo":
if self.__metric.find("KernelInfo") != -1:
self.tracer.activityFlushAll()
self.opListTracer = self.tracer.opList
self.opListCounter = self.counter.opList
self.startTimeList = self.tracer.getStartTimeLists()
self.traceDataRt = self.tracer.getTraceDataRt()
self.traceDataApi = self.tracer.getTraceDataApi()
self.countData = self.counter.getCountData()
kernelInfo(self.opListTracer, self.opListCounter, self.startTimeList, self.traceDataApi, self.traceDataRt, self.countData)
if self.__metric == "KernelInfo":
kernelInfo(self.opListTracer, self.opListCounter, self.startTimeList, self.traceDataApi, self.traceDataRt, self.countData)
elif self.__metric == "KernelInfoTracer":
kernelInfoTracer(self.opListTracer, self.startTimeList, self.traceDataApi, self.traceDataRt)
elif self.__metric == "KernelInfoCounter":
kernelInfoCounter(self.countData, flopCount=False)
else:
sys.exit("Profiler.Metric: " + self.__metric + " not supported")
return

if self.__metric == "OpInfo":
if self.__metric.find("OpInfo") != -1:
self.tracer.activityFlushAll()
self.opListTracer = self.tracer.opList
self.opListCounter = self.counter.opList
self.startTimeList = self.tracer.getStartTimeLists()
self.endTimeList = self.tracer.getEndTimeLists()
self.traceDataRt = self.tracer.getTraceDataRt()
self.traceDataApi = self.tracer.getTraceDataApi()
self.countData = self.counter.getCountData()
opInfo(self.opListTracer, self.opListCounter, self.startTimeList, self.endTimeList, self.traceDataApi, self.traceDataRt, self.countData)
if self.__metric == "OpInfo":
opInfo(self.opListTracer, self.opListCounter, self.startTimeList, self.endTimeList, self.traceDataApi, self.traceDataRt, self.countData)
elif self.__metric == "OpInfoTracer":
opInfoTracer(self.opListTracer, self.startTimeList, self.endTimeList, self.traceDataApi, self.traceDataRt)
elif self.__metric == "OpInfoCounter":
opInfoCounter(self.countData, flopCount=False)
else:
sys.exit("Profiler.Metric: " + self.__metric + " not supported")
return

sys.exit("Profiler.Metric: " + self.__metric + " not supported")
Expand Down
4 changes: 3 additions & 1 deletion src/amanda/profiler/tensorflow/amanda_tracer.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def forward_instrumentation(self, context: amanda.OpContext):
if not tensor.dtype._is_ref_dtype
]

if len(op_outputs) != 0 and len(op_inputs) != 0:
if len(op_outputs) != 0 and len(op_inputs) != 0 and op.name.find("Initializer") == -1:
# if len(op_outputs) != 0 and len(op_inputs) != 0 and op.name.find("Relu") != -1:
self.opList.append(op.name)
context.insert_before_op(
Expand All @@ -55,6 +55,7 @@ def forward_instrumentation(self, context: amanda.OpContext):

def init_trace(self, *inputs, op):
def extract_fn(*inputs):
self.activityFlushAll()
self.tracer.initTrace()
self.beforeCount += 1
return inputs
Expand All @@ -70,6 +71,7 @@ def extract_fn(*inputs):
def finish_trace(self, *outputs, op):
def extract_fn(*outputs):
self.tracer.finishTrace()
self.tracer.activityFlushAll()
self.afterCount += 1
return outputs

Expand Down
6 changes: 3 additions & 3 deletions src/amanda/profiler/tensorflow/test/kernelInfo_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,16 @@
from examples.common.tensorflow.model.resnet_50 import ResNet50

sys.path.append("..")
from profiler import Profiler
from amanda_profiler import amandaProfiler

def main():

model = ResNet50()
x = tf.random.uniform(shape = [1, 224, 224, 3])
y = model(x)

metric = "KernelInfo"
profiler = Profiler(metric)
metric = "KernelInfoTracer"
profiler = amandaProfiler(metric)
profiler.setConfigs(metric=metric, supplyInfo=[])

session_ = profiler.createSessionCounter()
Expand Down
6 changes: 3 additions & 3 deletions src/amanda/profiler/tensorflow/test/opInfo_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,16 @@
from examples.common.tensorflow.model.resnet_50 import ResNet50

sys.path.append("..")
from profiler import Profiler
from amanda_profiler import amandaProfiler

def main():

model = ResNet50()
x = tf.random.uniform(shape = [1, 224, 224, 3])
x = tf.random.uniform(shape = [16, 224, 224, 3])
y = model(x)

metric = "OpInfo"
profiler = Profiler(metric)
profiler = amandaProfiler(metric)
profiler.setConfigs(metric=metric, supplyInfo=[])

session_ = profiler.createSessionCounter()
Expand Down
16 changes: 12 additions & 4 deletions src/amanda/profiler/tracer/activity_trace_async.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -792,16 +792,24 @@ void tracer::initTrace()
// Some attributes require to be set before any CUDA context is created to be effective,
// e.g. to be applied to all device buffer allocations (see documentation).
CUPTI_CALL(cuptiActivityGetAttribute(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE, &attrValueSize, &attrValue));
printf("%s = %llu\n", "CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE", (long long unsigned)attrValue);
// printf("%s = %llu\n", "CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE", (long long unsigned)attrValue);
attrValue *= 2;
CUPTI_CALL(cuptiActivitySetAttribute(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE, &attrValueSize, &attrValue));

CUPTI_CALL(cuptiActivityGetAttribute(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT, &attrValueSize, &attrValue));
printf("%s = %llu\n", "CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT", (long long unsigned)attrValue);
// printf("%s = %llu\n", "CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT", (long long unsigned)attrValue);
attrValue *= 2;
CUPTI_CALL(cuptiActivitySetAttribute(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT, &attrValueSize, &attrValue));

CUPTI_CALL(cuptiActivityGetAttribute(CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_SIZE, &attrValueSize, &attrValue));
printf("%s = %llu\n", "CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_SIZE", (long long unsigned)attrValue);
// printf("%s = %llu\n", "CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_SIZE", (long long unsigned)attrValue);
attrValue *= 2;
CUPTI_CALL(cuptiActivitySetAttribute(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT, &attrValueSize, &attrValue));

CUPTI_CALL(cuptiActivityGetAttribute(CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_LIMIT, &attrValueSize, &attrValue));
printf("%s = %llu\n", "CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT", (long long unsigned)attrValue);
// printf("%s = %llu\n", "CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT", (long long unsigned)attrValue);
attrValue *= 2;
CUPTI_CALL(cuptiActivitySetAttribute(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT, &attrValueSize, &attrValue));

CUPTI_CALL(cuptiGetTimestamp(&startTimestamp));
traceFile << "Update startTimeStamp: " << startTimestamp << " thread: " << pthread_self() << std::endl;
Expand Down
Loading

0 comments on commit 138124c

Please sign in to comment.