forked from dmlc/dgl
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[DGL-KE] Distributed training of DGL-KE (dmlc#1290)
* update * change name * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * change worker number * update * update * update * update * update * update * test * update * update * update * remove barrier * max_step * update * add complex * update * chmod +x * update * update * random partition * random partition * update * update * update * update * update * update * update * update * update * update * update * change num_test_proc * update num_thread * update
- Loading branch information
Showing
13 changed files
with
872 additions
and
19 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
#!/usr/bin/env bash
##################################################################################
# This script runs the ComplEx model on the Freebase dataset in a distributed
# setting.
# You can change the hyper-parameters in this file, but DO NOT run this script
# manually.
#
# Arguments:
#   $1 - machine_id:   index of this machine; selects its range of server ids
#   $2 - server_count: number of kvserver processes to start on this machine
##################################################################################
machine_id=${1:?usage: $0 <machine_id> <server_count>}
server_count=${2:?usage: $0 <machine_id> <server_count>}

# Both values are used in shell arithmetic below, so insist on plain integers.
if ! [[ "$machine_id" =~ ^[0-9]+$ && "$server_count" =~ ^[0-9]+$ ]]; then
  printf 'machine_id and server_count must be non-negative integers\n' >&2
  exit 2
fi

# Delete the temp files. -f keeps rm quiet when the glob matches nothing.
rm -f -- *-shape

##################################################################################
# Start kvserver
##################################################################################
# This machine owns server ids in [server_id_low, server_id_high).
server_id_low=$((machine_id * server_count))
server_id_high=$(((machine_id + 1) * server_count))

# NOTE(review): --total_client (160) looks like --total_machine (4) times
# --num_client (40) from the kvclient command below; confirm and keep in sync.
for ((server_id = server_id_low; server_id < server_id_high; server_id++)); do
  MKL_NUM_THREADS=1 OMP_NUM_THREADS=1 DGLBACKEND=pytorch python3 ../kvserver.py \
    --model ComplEx --dataset Freebase \
    --hidden_dim 400 --gamma 143.0 --lr 0.1 --total_client 160 \
    --server_id "$server_id" &
done

##################################################################################
# Start kvclient
##################################################################################
# Runs in the foreground; its exit status becomes the exit status of the script.
MKL_NUM_THREADS=1 OMP_NUM_THREADS=1 DGLBACKEND=pytorch python3 ../kvclient.py \
  --model ComplEx --dataset Freebase \
  --batch_size 1024 --neg_sample_size 256 --hidden_dim 400 --gamma 143.0 \
  --lr 0.1 --max_step 12500 --log_interval 100 \
  --batch_size_eval 1000 --neg_sample_size_test 1000 --test -adv \
  --total_machine 4 --num_client 40
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
#!/usr/bin/env bash
##################################################################################
# This script runs the DistMult model on the Freebase dataset in a distributed
# setting.
# You can change the hyper-parameters in this file, but DO NOT run this script
# manually.
#
# Arguments:
#   $1 - machine_id:   index of this machine; selects its range of server ids
#   $2 - server_count: number of kvserver processes to start on this machine
##################################################################################
machine_id=${1:?usage: $0 <machine_id> <server_count>}
server_count=${2:?usage: $0 <machine_id> <server_count>}

# Both values are used in shell arithmetic below, so insist on plain integers.
if ! [[ "$machine_id" =~ ^[0-9]+$ && "$server_count" =~ ^[0-9]+$ ]]; then
  printf 'machine_id and server_count must be non-negative integers\n' >&2
  exit 2
fi

# Delete the temp files. -f keeps rm quiet when the glob matches nothing.
rm -f -- *-shape

##################################################################################
# Start kvserver
##################################################################################
# This machine owns server ids in [server_id_low, server_id_high).
server_id_low=$((machine_id * server_count))
server_id_high=$(((machine_id + 1) * server_count))

# NOTE(review): --total_client (160) looks like --total_machine (4) times
# --num_client (40) from the kvclient command below; confirm and keep in sync.
for ((server_id = server_id_low; server_id < server_id_high; server_id++)); do
  MKL_NUM_THREADS=1 OMP_NUM_THREADS=1 DGLBACKEND=pytorch python3 ../kvserver.py \
    --model DistMult --dataset Freebase \
    --hidden_dim 400 --gamma 143.0 --lr 0.08 --total_client 160 \
    --server_id "$server_id" &
done

##################################################################################
# Start kvclient
##################################################################################
# Runs in the foreground; its exit status becomes the exit status of the script.
MKL_NUM_THREADS=1 OMP_NUM_THREADS=1 DGLBACKEND=pytorch python3 ../kvclient.py \
  --model DistMult --dataset Freebase \
  --batch_size 1024 --neg_sample_size 256 --hidden_dim 400 --gamma 143.0 \
  --lr 0.08 --max_step 12500 --log_interval 100 \
  --batch_size_eval 1000 --neg_sample_size_test 1000 --test -adv \
  --total_machine 4 --num_client 40
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
#!/usr/bin/env bash
##################################################################################
# This script runs the TransE_l2 model on the Freebase dataset in a distributed
# setting.
# You can change the hyper-parameters in this file, but DO NOT run this script
# manually.
#
# Arguments:
#   $1 - machine_id:   index of this machine; selects its range of server ids
#   $2 - server_count: number of kvserver processes to start on this machine
##################################################################################
machine_id=${1:?usage: $0 <machine_id> <server_count>}
server_count=${2:?usage: $0 <machine_id> <server_count>}

# Both values are used in shell arithmetic below, so insist on plain integers.
if ! [[ "$machine_id" =~ ^[0-9]+$ && "$server_count" =~ ^[0-9]+$ ]]; then
  printf 'machine_id and server_count must be non-negative integers\n' >&2
  exit 2
fi

# Delete the temp files. -f keeps rm quiet when the glob matches nothing.
rm -f -- *-shape

##################################################################################
# Start kvserver
##################################################################################
# This machine owns server ids in [server_id_low, server_id_high).
server_id_low=$((machine_id * server_count))
server_id_high=$(((machine_id + 1) * server_count))

# NOTE(review): --total_client (160) looks like --total_machine (4) times
# --num_client (40) from the kvclient command below; confirm and keep in sync.
for ((server_id = server_id_low; server_id < server_id_high; server_id++)); do
  MKL_NUM_THREADS=1 OMP_NUM_THREADS=1 DGLBACKEND=pytorch python3 ../kvserver.py \
    --model TransE_l2 --dataset Freebase \
    --hidden_dim 400 --gamma 10 --lr 0.1 --total_client 160 \
    --server_id "$server_id" &
done

##################################################################################
# Start kvclient
##################################################################################
# Runs in the foreground; its exit status becomes the exit status of the script.
MKL_NUM_THREADS=1 OMP_NUM_THREADS=1 DGLBACKEND=pytorch python3 ../kvclient.py \
  --model TransE_l2 --dataset Freebase \
  --batch_size 1000 --neg_sample_size 200 --hidden_dim 400 --gamma 10 \
  --lr 0.1 --max_step 12500 --log_interval 100 \
  --batch_size_eval 1000 --neg_sample_size_test 1000 --test -adv \
  --regularization_coef 1e-9 --total_machine 4 --num_client 40
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
127.0.0.1 30050 8 | ||
127.0.0.1 30050 8 | ||
127.0.0.1 30050 8 | ||
127.0.0.1 30050 8 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
#!/usr/bin/env bash
##################################################################################
# User runs this script to launch distributed jobs on a cluster.
#
# Reads ip_config.txt from the current directory:
#   - the third field of its first line is passed to every job script as the
#     server count;
#   - the first field of every non-blank line after the first is the address of
#     a remote machine.
# The job script is started over ssh, in the background, on each remote machine
# with machine ids 1, 2, ... in file order, and then on the local machine with
# machine id 0. The exit status is that of the local job script.
##################################################################################
# NOTE: the tildes below are expanded on the local machine, so script_path must
# resolve to the same absolute path on every remote machine.
script_path=~/dgl/apps/kg/distributed
script_file=./freebase_transe_l2.sh
user_name=ubuntu
ssh_key=~/mctt.pem
ip_config=ip_config.txt

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

[[ -r "$ip_config" ]] || die "cannot read $ip_config"

server_count=$(awk 'NR == 1 {print $3}' "$ip_config")
[[ "$server_count" =~ ^[0-9]+$ ]] \
  || die "line 1 of $ip_config must have a numeric third field (server count)"

# run command on remote machine
s_id=0
while IFS= read -r ip; do
  s_id=$((s_id + 1))
  # Use && so the job script (which deletes files in its working directory) is
  # never started when the cd fails.
  printf -v remote_cmd 'cd %q && %q %q %q' \
    "$script_path" "$script_file" "$s_id" "$server_count"
  # -n: a backgrounded ssh must not read from this script's stdin.
  ssh -n -i "$ssh_key" "${user_name}@${ip}" "$remote_cmd" &
done < <(awk 'NR > 1 && NF > 0 {print $1}' "$ip_config")

# run command on local machine
"$script_file" 0 "$server_count"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.