Entire matrix A is owned by one Worker_A

Computations are done in other workers and sent back to Worker_A hpl_par.j still does not have spawns, but this is preparing to get there Code works correctly
Cthulhoni · Jun 20, 2011 · f627d98 · f627d98
1 parent f9bad94
commit f627d98
Showing 2 changed files with 54 additions and 47 deletions.
diff --git a/test/bench/hpl_par.j b/test/bench/hpl_par.j
@@ -1,7 +1,11 @@
 ## Based on "Multi-Threading and One-Sided Communication in Parallel LU Factorization"
 ## http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.138.4361&rank=7
 
-function hpl (A::Matrix, b::Vector)
+# This version is written for a shared memory implementation.
+# The matrix A is local to the first Worker, which allocates work to other Workers
+# All updates to A are carried out by the first Worker. Thus A is not distributed
+
+function hpl_par(A::Matrix, b::Vector)
 
     blocksize = 5
 
@@ -12,23 +16,53 @@ function hpl (A::Matrix, b::Vector)
     B_rows[end] = n 
     B_cols = [B_rows, [n+1]]
     nB = length(B_rows)
-    #depend = zeros(Bool, nB, nB) # In parallel, depend needs to be able to hold futures
-    depend = Array(RemoteRef, nB, nB)
+    depend = zeros(Bool, nB, nB) # In parallel, depend needs to be able to hold futures
+
+    ## Small matrix case
+    if nB <= 1
+        x = A[1:n, 1:n] \ A[:,n+1]
+        return x
+    end
 
     ## Add a ghost row of dependencies to boostrap the computation
     for j=1:nB; depend[1,j] = true; end
 
     for i=1:(nB-1)
         ## Threads for panel factorizations
-        K = (B_rows[i]+1):n
         I = (B_rows[i]+1):B_rows[i+1]
-        #(depend[i+1,i], panel_p) = panel_factor(A, I, depend[i,i])        
-        rref_pf = @spawn panel_factor(A[K,I], depend[i,i]) 
-        depend[i+1,i] = @spawnlocal spawn_trailing_updates ()
+        K = I[1]:n
+        (A_KI, panel_p) = panel_factor(A[K,I], depend[i,i])
+
+        ## Write the factorized panel back to A
+        A[K,I] = A_KI
+
+        ## Panel permutation 
+        panel_p = K[panel_p]
+        depend[i+1,i] = true
+
+        ## Apply permutation from pivoting
+        J = (B_cols[i+1]+1):B_cols[nB+1]
+        A[K, J] = A[panel_p, J]
+
+        ## Threads for trailing updates
+        L_II = tril(A[I,I], -1) + eye(length(I))
+        K = (I[end]+1):n
+
+        for j=(i+1):nB
+            J = (B_cols[j]+1):B_cols[j+1]
+
+            ## Do the trailing update (Compute U, and DGEMM - all flops are here)
+            (A_IJ, A_KJ) = trailing_update(L_II, A[I,J], A[K,I], A[K,J], 
+                                           depend[i+1,i], depend[i,j])
+            ## Write updates back to A
+            A[I,J] = A_IJ
+            A[K,J] = A_KJ
+            depend[i+1,j] = true
+        end
     end
 
     ## Completion of the last diagonal block signals termination
-    #wait(depend[nB, nB])
+    @assert depend[nB, nB]
 
     ## Solve the triangular system
     x = triu(A[1:n,1:n]) \ A[:,n+1]
@@ -40,53 +74,26 @@ end ## hpl()
 
 ### Panel factorization ###
 
-function panel_factor(A, col_dep)
-
-    ## Enforce dependencies
-    wait(col_dep)
-
+function panel_factor(A_KI, col_dep)
+
     ## Factorize a panel
-    panel_p = lu(A, true) # Economy mode. A is overwritten with factors.
+    (A_KI, panel_p) = lu(A_KI, true) # Economy mode
 
-    return (A, panel_p)
+    return (A_KI, panel_p)
 
 end ## panel_factor()
 
-### spawn trailing updates ###
 
-function spawn_trailing_updates(A, depend, rref_pf)
+### Trailing update ###
 
-    (A[K,I], p) = fetch(rref_pf)
-    panel_p = K[p]
-
-    for j=(i+1):nB
-        J = (B_cols[j]+1):B_cols[j+1]
-        #depend[i+1,j] = trailing_update(A, I, J, panel_p, depend[i+1,i],depend[i,j])
-        rref_tu = @spawn trailing_update(A, I, J, panel_p, depend[i+1,i],depend[i,j])
-        depend[i+1,j] = @spawnlocal (A[I,J], A[K,J]) = fetch(rref_tu)
-    end
-
-end
-
-### One block column trailing update ###
-
-function trailing_update(A_II, A_IJ, A_KI, A_IJ, panel_p, row_dep, col_dep)
-
-    ## Enforce dependencies
-    wait(row_dep)
-    wait(col_dep)
-
-    ## Apply permutation from pivoting 
-    K = (I[end]+1):n
-    A[I[1]:n, J] = A[panel_p, J]
+function trailing_update(L_II, A_IJ, A_KI, A_KJ, row_dep, col_dep)
 
-    ## Apply pivot sequence to Compute blocks of U 
-    L = tril(A_II, -1) + eye(length(I))
-    A_IJ = L \ A_IJ 
+    ## Compute blocks of U 
+    A_IJ = L_II \ A_IJ
 
-    ## Trailing submatrix update
-    if !isempty(K)
-        A_KJ = A_KJ - A_KI*A_IJ  
+    ## Trailing submatrix update - All flops are here
+    if !isempty(A_KJ)
+        A_KJ = A_KJ - A_KI*A_IJ
     end
 
     return (A_IJ, A_KJ)

diff --git a/test/bench/hpl_seq.j b/test/bench/hpl_seq.j
@@ -1,7 +1,7 @@
 ## Based on "Multi-Threading and One-Sided Communication in Parallel LU Factorization"
 ## http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.138.4361&rank=7
 
-function hpl (A::Matrix, b::Vector)
+function hpl_seq(A::Matrix, b::Vector)
 
     blocksize = 5