Commit
Merge branch 'gert' into try-merge-gert
Conflicts:

	Makefile.in
	src/python/gpuarray.py
inducer committed Aug 21, 2008
2 parents 7a87c08 + d9ec83a commit b70ee8d
Showing 16 changed files with 1,828 additions and 288 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -1,3 +1,6 @@
.pydevproject
.project
.settings
*~
.*.sw[po]
*.dat
5 changes: 5 additions & 0 deletions doc/source/array.rst
@@ -62,3 +62,8 @@ The :class:`GPUArray` Array Class

Same as :func:`empty`, but the :class:`GPUArray` is zero-initialized before
being returned.

We also provide a math module, :mod:`pycuda.cumath`, which emulates the standard Python
:mod:`math` module and executes all of its functions on the GPU instead of the CPU,
provided the argument is of a supported type. Right now the supported types are
:class:`GPUArray` and classes which inherit from it, such as SimpleArray.
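
A minimal usage sketch (assuming :func:`sin` is among the emulated functions, and
following the driver-initialization pattern from the tutorial)::

    import numpy
    import pycuda.driver as cuda
    import pycuda.gpuarray as gpuarray
    import pycuda.cumath as cumath

    # initialize the driver and create a context on the first device
    cuda.init()
    assert cuda.Device.count() >= 1
    ctx = cuda.Device(0).make_context()

    # move host data to the GPU, evaluate sin there, and fetch the result
    a_gpu = gpuarray.to_gpu(numpy.random.randn(100).astype(numpy.float32))
    print cumath.sin(a_gpu).get()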
18 changes: 17 additions & 1 deletion doc/source/tutorial.rst
@@ -97,15 +97,31 @@ Using a :class:`pycuda.gpuarray.GPUArray`, the same effect can be
achieved with much less writing::

import pycuda.gpuarray as gpuarray
import pycuda.driver as cuda

cuda.init()
assert cuda.Device.count() >= 1

dev = cuda.Device(0)
ctx = dev.make_context()
a_gpu = gpuarray.to_gpu(numpy.random.randn(4,4).astype(numpy.float32))
a_doubled = (2*a_gpu).get()
print a_doubled
print a_gpu
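
As a quick sanity check, the result fetched from the GPU can be compared against the
same computation done with numpy on the host (a minimal sketch, assuming numpy is
imported as in the surrounding tutorial)::

    # the doubled array brought back from the GPU should match plain
    # numpy arithmetic on a host-side copy of the original data
    a_cpu = a_gpu.get()
    assert numpy.allclose(a_doubled, 2*a_cpu)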


Where to Go from Here
---------------------

Once you feel sufficiently familiar with the basics, feel free to dig into the
:ref:`reference-doc`. Also check out PyCuda's test suite at
-:file:`test/test_driver.py`. It contains examples (and tests!) of many more
+:file:`test`. It contains examples (and tests!) of many more
advanced techniques.

More examples
-------------

If you want to see more examples, you can find them in the :file:`examples` folder.
This folder also contains several benchmarks that show the difference between GPU-
and CPU-based calculations.
File renamed without changes.
File renamed without changes.
File renamed without changes.
23 changes: 23 additions & 0 deletions examples/plot_random_data.py
@@ -0,0 +1,23 @@
#simple module to show the plotting of random data

import pycuda.gpuarray as cuda
from matplotlib.pylab import *

size = 1000

#random data generated on gpu
a = cuda.array(size).randn()


subplot(211)
plot(a)
grid(True)
ylabel('plot - gpu')

subplot(212)
hist(a, 100)
grid(True)
ylabel('histogram - gpu')

#and save it
savefig('plot-random-data')
28 changes: 8 additions & 20 deletions test/test_gpuarray_speed.py → examples/test_gpuarray_speed.py
100644 → 100755
@@ -53,7 +53,7 @@ def main():
secs = start.time_till(end)*1e-3

times.append(secs/count)
-        flops.append(size*4)
+        flops.append(size)

#cpu operations which adds two arrays
aCpu = numpy.random.randn(size).astype(numpy.float32)
@@ -77,33 +77,21 @@ def main():

#add results to variable
timesCPU.append(secs/count)
-        flopsCPU.append(size*4)
+        flopsCPU.append(size)


#calculate pseudo flops
flops = [f/t for f, t in zip(flops,times)]
flopsCPU = [f/t for f, t in zip(flopsCPU,timesCPU)]

#print the data out
-    try:
-        from matplotlib.pylab import semilogx, show, title
-    except ImportError:
-        from pytools import Table
-        tbl = Table()
-        tbl.add_row(("Size", "Time GPU", "Giga Flops GPU", "Time CPU","Giga Flops CPU"))
-        for s, t, f,tCpu,fCpu in zip(sizes, times, flops,timesCPU,flopsCPU):
-            tbl.add_row((s,t,f/1000000000,tCpu,fCpu/1000000000))
-        print tbl
-    else:
-        title("time to add two vectors")
-        semilogx(sizes, times)
-        show()
-        title("flops")
-        semilogx(sizes, flops)
-        show()
+    from pytools import Table
+    tbl = Table()
+    tbl.add_row(("Size", "Time GPU", "Size/Time GPU", "Time CPU","Size/Time CPU","GPU vs CPU speedup"))
+    for s, t, f,tCpu,fCpu in zip(sizes, times, flops,timesCPU,flopsCPU):
+        tbl.add_row((s,t,f,tCpu,fCpu,f/fCpu))
+    print tbl



94 changes: 94 additions & 0 deletions examples/test_gpuarray_speed_random.py
@@ -0,0 +1,94 @@
#! /usr/bin/env python
import pycuda.driver as drv
import numpy
import numpy.linalg as la
from pytools import Table




def main():
    drv.init()
    assert drv.Device.count() >= 1
    ctx = drv.Device(0).make_context()

    import pycuda.gpuarray as gpuarray

    # make sure all the kernels are compiled
    gpuarray.GPUArray.compile_kernels()
    print "done compiling"

    sizes = []
    times = []
    flops = []
    flopsCPU = []
    timesCPU = []

    for power in range(10, 25): # 24
        size = 1<<power
        print size
        sizes.append(size)
        a = gpuarray.zeros((size,), dtype=numpy.float32)

        if power > 20:
            count = 100
        else:
            count = 1000

        #start timer
        start = drv.Event()
        end = drv.Event()
        start.record()

        #cuda operation which fills the array with random numbers
        for i in range(count):
            a.randn()

        #stop timer
        end.record()
        end.synchronize()

        #calculate used time
        secs = start.time_till(end)*1e-3

        times.append(secs/count)
        flops.append(size)

        #allocate a cpu-side array of the same size
        a = numpy.empty(size, dtype=numpy.float32)

        #start timer
        start = drv.Event()
        end = drv.Event()
        start.record()

        #cpu operation which fills the array with random data
        for i in range(count):
            numpy.random.randn(size).astype(numpy.float32)

        #stop timer
        end.record()
        end.synchronize()

        #calculate used time
        secs = start.time_till(end)*1e-3

        #add results to variable
        timesCPU.append(secs/count)
        flopsCPU.append(size)


    #calculate pseudo flops
    flops = [f/t for f, t in zip(flops,times)]
    flopsCPU = [f/t for f, t in zip(flopsCPU,timesCPU)]

    #print the data out
    tbl = Table()
    tbl.add_row(("Size", "Time GPU", "Size/Time GPU", "Time CPU","Size/Time CPU","GPU vs CPU speedup"))
    for s, t, f,tCpu,fCpu in zip(sizes, times, flops,timesCPU,flopsCPU):
        tbl.add_row((s,t,f,tCpu,fCpu,f/fCpu))
    print tbl


if __name__ == "__main__":
    main()
144 changes: 144 additions & 0 deletions examples/test_math_speed.py
@@ -0,0 +1,144 @@
#! /usr/bin/env python
import pycuda.cumath as cuma
import pycuda._kernel as kernel
import pycuda.gpuarray as cuda
import pycuda.driver as drv
import types
import numpy
from pytools import Table

runs = 10

drv.init()
assert drv.Device.count() >= 1
ctx = drv.Device(0).make_context()


def time_cpu_execution(size,method,argumentCount):
    """times the execution time on the cpu"""

    start = drv.Event()
    end = drv.Event()
    start.record()

    a = numpy.zeros(size,numpy.float32)+1

    for x in range(runs):
        for i in range(size):
            if argumentCount == 1:
                method(a[i])
            if argumentCount == 2:
                method(a[i],2)

    #stop timer
    end.record()
    end.synchronize()

    #calculate used time
    secs = start.time_till(end)

    return secs

def time_gpu_execution(size,method,argumentCount):
    """times the execution time on the gpu"""
    start = drv.Event()
    end = drv.Event()
    start.record()

    a = cuda.array(size)+1

    for x in range(runs):
        if argumentCount == 1:
            method(a)
        if argumentCount == 2:
            method(a,2)

    #stop timer
    end.record()
    end.synchronize()

    #calculate used time
    secs = start.time_till(end)

    return secs

#iterate over all methods and time the execution time with different array sizes
print "compile kernels"
kernel._compile_kernels(kernel)

#generate our output tables, one for gpu, one for cpu, one for the speedup
tblCPU = Table()
tblGPU = Table()
tblSPD = Table()

#contains all the method names
methods = ["size"]

for name in dir(cuma):
    if not (name.startswith("__") and name.endswith("__")):
        method = getattr(cuma, name)

        if type(method) == types.FunctionType:
            methods.append(name)

tblCPU.add_row(methods)
tblGPU.add_row(methods)
tblSPD.add_row(methods)

#generate arrays with different sizes
for power in range(1,20):
    size = 1<<power

    #temp variables
    rowCPU = [size]
    rowGPU = [size]
    rowSPD = [size]

    print "calculating: ", size

    for name in dir(cuma):
        if not (name.startswith("__") and name.endswith("__")):

            method = getattr(cuma, name)

            if type(method) == types.FunctionType:
                code = method.func_code
                argCount = code.co_argcount

                gpu_time = time_gpu_execution(size,method,argCount)
                cpu_time = time_cpu_execution(size,method,argCount)

                rowCPU.append(str(cpu_time/runs)[0:7])
                rowGPU.append(str(gpu_time/runs)[0:7])

                speed_cpu = size/(cpu_time/runs)
                speed_gpu = size/(gpu_time/runs)
                rowSPD.append(str(speed_gpu/speed_cpu)[0:7])

    tblCPU.add_row(rowCPU)
    tblGPU.add_row(rowGPU)
    tblSPD.add_row(rowSPD)

print ""

print "GPU Times (ms)"

print ""

print tblGPU

print ""

print "CPU Times (ms)"

print ""
print tblCPU


print ""

print "GPU VS CPU"

print ""
print tblSPD

