Skip to content

Commit

Permalink
prefetch scale/bias (pytorch#322)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: pytorch#322

When block_size is a multiple of the cache line size, the scale and bias stored after the row data start on a separate cache line, so we also need to prefetch that line.
Thanks Wu Yong and Dhiraj Kalamkar for this suggestion!

Other minor fixes:
* Remove extra prefetches in the 2/4-bit cases
* Fix a bug where the cache-flushed case performance was not printed by EmbeddingSpMDM8BitBenchmark

Measured on a Xeon Gold 6138, single-core run with numactl -m 0 -C 3.
The improvements show up in the cache-flushed, prefetch-on cases.

EmbeddingSpMDM8BitBenchmark
Before this diff
```
batch size    10  num rows         4000000   emb dim    64      avg length   100
64 bit indices, lengths_sum 898
            SLS    cache not flushed    prefetch off     b/w   9.90465 GB/s     effective b/w:           16.834GB/s   time       7.2572e-06 load_imbalance 0
            SLS        cache flushed    prefetch off     b/w   1.54901 GB/s     effective b/w:          2.63272GB/s   time      4.64037e-05 load_imbalance 0
64 bit indices with prefetching, lengths_sum 898
            SLS    cache not flushed     prefetch on     b/w   12.5395 GB/s     effective b/w:          21.3122GB/s   time       5.7323e-06 load_imbalance 0
            SLS        cache flushed     prefetch on     b/w    1.7886 GB/s     effective b/w:          3.03993GB/s   time      4.01878e-05 load_imbalance 0
32 bit indices, lengths_sum 898
            SLS    cache not flushed    prefetch off     b/w   9.90054 GB/s     effective b/w:          17.1914GB/s   time       6.8974e-06 load_imbalance 0
            SLS        cache flushed    prefetch off     b/w   1.45208 GB/s     effective b/w:           2.5214GB/s   time      4.70278e-05 load_imbalance 0
32 bit indices with prefetching, lengths_sum 898
            SLS    cache not flushed     prefetch on     b/w   12.1912 GB/s     effective b/w:           21.169GB/s   time       5.6014e-06 load_imbalance 0
            SLS        cache flushed     prefetch on     b/w   1.75153 GB/s     effective b/w:          3.04138GB/s   time      3.89876e-05 load_imbalance 0
batch size    10  num rows         4000000   emb dim   128      avg length   100
64 bit indices, lengths_sum 898
            SLS    cache not flushed    prefetch off     b/w    10.603 GB/s     effective b/w:          14.7251GB/s   time      1.21996e-05 load_imbalance 0
            SLS        cache flushed    prefetch off     b/w   1.60321 GB/s     effective b/w:          2.22649GB/s   time      8.06831e-05 load_imbalance 0
64 bit indices with prefetching, lengths_sum 898
            SLS    cache not flushed     prefetch on     b/w   13.4293 GB/s     effective b/w:          18.6501GB/s   time       9.6321e-06 load_imbalance 0
            SLS        cache flushed     prefetch on     b/w   2.31425 GB/s     effective b/w:          3.21396GB/s   time      5.58936e-05 load_imbalance 0
32 bit indices, lengths_sum 898
            SLS    cache not flushed    prefetch off     b/w   10.6253 GB/s     effective b/w:          14.8741GB/s   time      1.18359e-05 load_imbalance 0
            SLS        cache flushed    prefetch off     b/w   1.58324 GB/s     effective b/w:          2.21633GB/s   time      7.94322e-05 load_imbalance 0
32 bit indices with prefetching, lengths_sum 898
            SLS    cache not flushed     prefetch on     b/w   13.5941 GB/s     effective b/w:            19.03GB/s   time       9.2511e-06 load_imbalance 0
            SLS        cache flushed     prefetch on     b/w   2.27687 GB/s     effective b/w:          3.18733GB/s   time      5.52337e-05 load_imbalance 0
batch size    10  num rows         4000000   emb dim   256      avg length   100
64 bit indices, lengths_sum 898
            SLS    cache not flushed    prefetch off     b/w   10.2647 GB/s     effective b/w:          12.3777GB/s   time      2.37996e-05 load_imbalance 0
            SLS        cache flushed    prefetch off     b/w   1.36054 GB/s     effective b/w:           1.6406GB/s   time      0.000179558 load_imbalance 0
64 bit indices with prefetching, lengths_sum 898
            SLS    cache not flushed     prefetch on     b/w   14.2034 GB/s     effective b/w:          17.1272GB/s   time      1.71998e-05 load_imbalance 0
            SLS        cache flushed     prefetch on     b/w   2.75221 GB/s     effective b/w:          3.31874GB/s   time      8.87637e-05 load_imbalance 0
32 bit indices, lengths_sum 898
            SLS    cache not flushed    prefetch off     b/w    11.114 GB/s     effective b/w:           13.436GB/s   time      2.16577e-05 load_imbalance 0
            SLS        cache flushed    prefetch off     b/w    1.3533 GB/s     effective b/w:          1.63603GB/s   time      0.000177864 load_imbalance 0
32 bit indices with prefetching, lengths_sum 898
            SLS    cache not flushed     prefetch on     b/w   13.7927 GB/s     effective b/w:          16.6742GB/s   time      1.74516e-05 load_imbalance 0
            SLS        cache flushed     prefetch on     b/w   2.84246 GB/s     effective b/w:           3.4363GB/s   time      8.46817e-05 load_imbalance 0
```
After this diff
```
batch size    10  num rows         4000000   emb dim    64      avg length   100
64 bit indices, lengths_sum 898
            SLS    cache not flushed    prefetch off     b/w   10.6528 GB/s     effective b/w:          18.1057GB/s   time       6.7475e-06 load_imbalance 0
            SLS        cache flushed    prefetch off     b/w   1.51285 GB/s     effective b/w:          2.57126GB/s   time      4.75129e-05 load_imbalance 0
64 bit indices with prefetching, lengths_sum 898
            SLS    cache not flushed     prefetch on     b/w   11.4795 GB/s     effective b/w:          19.5107GB/s   time       6.2616e-06 load_imbalance 0
            SLS        cache flushed     prefetch on     b/w   2.09112 GB/s     effective b/w:          3.55409GB/s   time      3.43739e-05 load_imbalance 0
32 bit indices, lengths_sum 898
            SLS    cache not flushed    prefetch off     b/w    10.087 GB/s     effective b/w:          17.5152GB/s   time       6.7699e-06 load_imbalance 0
            SLS        cache flushed    prefetch off     b/w   1.43725 GB/s     effective b/w:          2.49565GB/s   time       4.7513e-05 load_imbalance 0
32 bit indices with prefetching, lengths_sum 898
            SLS    cache not flushed     prefetch on     b/w   12.5297 GB/s     effective b/w:          21.7567GB/s   time       5.4501e-06 load_imbalance 0
            SLS        cache flushed     prefetch on     b/w   2.10461 GB/s     effective b/w:          3.65447GB/s   time      3.24468e-05 load_imbalance 0
batch size    10  num rows         4000000   emb dim   128      avg length   100
64 bit indices, lengths_sum 898
            SLS    cache not flushed    prefetch off     b/w   10.4409 GB/s     effective b/w:             14.5GB/s   time       1.2389e-05 load_imbalance 0
            SLS        cache flushed    prefetch off     b/w   1.57848 GB/s     effective b/w:          2.19214GB/s   time      8.19472e-05 load_imbalance 0
64 bit indices with prefetching, lengths_sum 898
            SLS    cache not flushed     prefetch on     b/w   13.6041 GB/s     effective b/w:           18.893GB/s   time       9.5083e-06 load_imbalance 0
            SLS        cache flushed     prefetch on     b/w     2.769 GB/s     effective b/w:           3.8455GB/s   time      4.67144e-05 load_imbalance 0
32 bit indices, lengths_sum 898
            SLS    cache not flushed    prefetch off     b/w    9.9698 GB/s     effective b/w:          13.9564GB/s   time      1.26141e-05 load_imbalance 0
            SLS        cache flushed    prefetch off     b/w    1.5171 GB/s     effective b/w:          2.12375GB/s   time      8.28949e-05 load_imbalance 0
32 bit indices with prefetching, lengths_sum 898
            SLS    cache not flushed     prefetch on     b/w   14.1504 GB/s     effective b/w:          19.8087GB/s   time       8.8874e-06 load_imbalance 0
            SLS        cache flushed     prefetch on     b/w   2.73169 GB/s     effective b/w:          3.82401GB/s   time      4.60375e-05 load_imbalance 0
batch size    10  num rows         4000000   emb dim   256      avg length   100
64 bit indices, lengths_sum 898
            SLS    cache not flushed    prefetch off     b/w   10.2009 GB/s     effective b/w:          12.3008GB/s   time      2.39484e-05 load_imbalance 0
            SLS        cache flushed    prefetch off     b/w   1.40221 GB/s     effective b/w:          1.69085GB/s   time      0.000174223 load_imbalance 0
64 bit indices with prefetching, lengths_sum 898
            SLS    cache not flushed     prefetch on     b/w   15.0724 GB/s     effective b/w:           18.175GB/s   time      1.62082e-05 load_imbalance 0
            SLS        cache flushed     prefetch on     b/w   3.07866 GB/s     effective b/w:           3.7124GB/s   time      7.93513e-05 load_imbalance 0
32 bit indices, lengths_sum 898
            SLS    cache not flushed    prefetch off     b/w   10.4975 GB/s     effective b/w:          12.6906GB/s   time      2.29297e-05 load_imbalance 0
            SLS        cache flushed    prefetch off     b/w   1.35172 GB/s     effective b/w:          1.63412GB/s   time      0.000178073 load_imbalance 0
32 bit indices with prefetching, lengths_sum 898
            SLS    cache not flushed     prefetch on     b/w   14.8721 GB/s     effective b/w:          17.9792GB/s   time      1.61849e-05 load_imbalance 0
            SLS        cache flushed     prefetch on     b/w   3.08183 GB/s     effective b/w:          3.72568GB/s   time      7.81043e-05 load_imbalance 0
```

EmbeddingSpMDMNBitBenchmark
Before this diff
```
bit_rate     2batch size    10  num rows         4000000   emb dim   256      avg length   100
64 bit indices, lengths_sum 898
            SLS    cache not flushed    prefetch off     b/w   1.80376 GB/s     effective b/w:          3.39532GB/s   time      3.38537e-05
            SLS        cache flushed    prefetch off     b/w  0.424192 GB/s     effective b/w:         0.798479GB/s   time      0.000143954
64 bit indices with prefetching, lengths_sum 898
            SLS    cache not flushed     prefetch on     b/w   1.99412 GB/s     effective b/w:          3.75363GB/s   time      3.06221e-05
            SLS        cache flushed     prefetch on     b/w  0.816262 GB/s     effective b/w:          1.53649GB/s   time      7.48093e-05
32 bit indices, lengths_sum 898
            SLS    cache not flushed    prefetch off     b/w   1.83764 GB/s     effective b/w:           3.4591GB/s   time      3.32295e-05
            SLS        cache flushed    prefetch off     b/w  0.419017 GB/s     effective b/w:         0.788737GB/s   time      0.000145732
32 bit indices with prefetching, lengths_sum 898
            SLS    cache not flushed     prefetch on     b/w   1.97789 GB/s     effective b/w:          3.72309GB/s   time      3.08733e-05
            SLS        cache flushed     prefetch on     b/w  0.808449 GB/s     effective b/w:          1.52179GB/s   time      7.55323e-05
bit_rate     4batch size    10  num rows         4000000   emb dim   128      avg length   100
64 bit indices, lengths_sum 898
            SLS    cache not flushed    prefetch off     b/w   3.60092 GB/s     effective b/w:           6.7782GB/s   time      1.69579e-05
            SLS        cache flushed    prefetch off     b/w  0.718051 GB/s     effective b/w:          1.35163GB/s   time      8.50413e-05
64 bit indices with prefetching, lengths_sum 898
            SLS    cache not flushed     prefetch on     b/w   4.16032 GB/s     effective b/w:           7.8312GB/s   time      1.46777e-05
            SLS        cache flushed     prefetch on     b/w   1.04765 GB/s     effective b/w:          1.97205GB/s   time      5.82865e-05
32 bit indices, lengths_sum 898
            SLS    cache not flushed    prefetch off     b/w   3.57939 GB/s     effective b/w:          6.73767GB/s   time      1.70599e-05
            SLS        cache flushed    prefetch off     b/w  0.716159 GB/s     effective b/w:          1.34806GB/s   time       8.5266e-05
32 bit indices with prefetching, lengths_sum 898
            SLS    cache not flushed     prefetch on     b/w   4.14479 GB/s     effective b/w:          7.80196GB/s   time      1.47327e-05
            SLS        cache flushed     prefetch on     b/w   1.01756 GB/s     effective b/w:          1.91541GB/s   time      6.00102e-05
bit_rate     4batch size    10  num rows         4000000   emb dim   256      avg length   100
64 bit indices, lengths_sum 898
            SLS    cache not flushed    prefetch off     b/w   3.84941 GB/s     effective b/w:          5.59914GB/s   time      3.07933e-05
            SLS        cache flushed    prefetch off     b/w  0.806039 GB/s     effective b/w:          1.17242GB/s   time       0.00014706
64 bit indices with prefetching, lengths_sum 898
            SLS    cache not flushed     prefetch on     b/w   4.48789 GB/s     effective b/w:          6.52784GB/s   time      2.64124e-05
            SLS        cache flushed     prefetch on     b/w   1.50334 GB/s     effective b/w:          2.18668GB/s   time      7.88482e-05
32 bit indices, lengths_sum 898
            SLS    cache not flushed    prefetch off     b/w   3.80736 GB/s     effective b/w:          5.53798GB/s   time      3.11334e-05
            SLS        cache flushed    prefetch off     b/w  0.804521 GB/s     effective b/w:          1.17021GB/s   time      0.000147337
32 bit indices with prefetching, lengths_sum 898
            SLS    cache not flushed     prefetch on     b/w   4.50978 GB/s     effective b/w:          6.55968GB/s   time      2.62842e-05
            SLS        cache flushed     prefetch on     b/w   1.46049 GB/s     effective b/w:          2.12435GB/s   time      8.11616e-05
```
After this diff
```
bit_rate     2batch size    10  num rows         4000000   emb dim   256      avg length   100
64 bit indices, lengths_sum 898
            SLS    cache not flushed    prefetch off     b/w   1.77694 GB/s     effective b/w:          3.34483GB/s   time      3.43647e-05
            SLS        cache flushed    prefetch off     b/w  0.415345 GB/s     effective b/w:         0.781826GB/s   time       0.00014702
64 bit indices with prefetching, lengths_sum 898
            SLS    cache not flushed     prefetch on     b/w   1.93419 GB/s     effective b/w:          3.64082GB/s   time      3.15709e-05
            SLS        cache flushed     prefetch on     b/w  0.923895 GB/s     effective b/w:           1.7391GB/s   time      6.60941e-05
32 bit indices, lengths_sum 898
            SLS    cache not flushed    prefetch off     b/w   1.81709 GB/s     effective b/w:           3.4204GB/s   time      3.36054e-05
            SLS        cache flushed    prefetch off     b/w  0.410926 GB/s     effective b/w:         0.773508GB/s   time      0.000148601
32 bit indices with prefetching, lengths_sum 898
            SLS    cache not flushed     prefetch on     b/w   1.99784 GB/s     effective b/w:          3.76064GB/s   time       3.0565e-05
            SLS        cache flushed     prefetch on     b/w  0.969064 GB/s     effective b/w:          1.82412GB/s   time      6.30134e-05
bit_rate     4batch size    10  num rows         4000000   emb dim   128      avg length   100
64 bit indices, lengths_sum 898
            SLS    cache not flushed    prefetch off     b/w   3.33873 GB/s     effective b/w:          6.28466GB/s   time      1.82896e-05
            SLS        cache flushed    prefetch off     b/w  0.712447 GB/s     effective b/w:          1.34108GB/s   time      8.57102e-05
64 bit indices with prefetching, lengths_sum 898
            SLS    cache not flushed     prefetch on     b/w   4.28977 GB/s     effective b/w:          8.07486GB/s   time      1.42348e-05
            SLS        cache flushed     prefetch on     b/w   1.29603 GB/s     effective b/w:          2.43958GB/s   time      4.71163e-05
32 bit indices, lengths_sum 898
            SLS    cache not flushed    prefetch off     b/w   3.56127 GB/s     effective b/w:          6.70356GB/s   time      1.71467e-05
            SLS        cache flushed    prefetch off     b/w  0.720089 GB/s     effective b/w:          1.35546GB/s   time      8.48006e-05
32 bit indices with prefetching, lengths_sum 898
            SLS    cache not flushed     prefetch on     b/w   4.27991 GB/s     effective b/w:           8.0563GB/s   time      1.42676e-05
            SLS        cache flushed     prefetch on     b/w   1.29738 GB/s     effective b/w:          2.44212GB/s   time      4.70673e-05
bit_rate     4batch size    10  num rows         4000000   emb dim   256      avg length   100
64 bit indices, lengths_sum 898
            SLS    cache not flushed    prefetch off     b/w   3.65701 GB/s     effective b/w:          5.31928GB/s   time      3.24134e-05
            SLS        cache flushed    prefetch off     b/w  0.745156 GB/s     effective b/w:          1.08386GB/s   time      0.000159075
64 bit indices with prefetching, lengths_sum 898
            SLS    cache not flushed     prefetch on     b/w    4.5061 GB/s     effective b/w:          6.55432GB/s   time      2.63057e-05
            SLS        cache flushed     prefetch on     b/w   1.81897 GB/s     effective b/w:          2.64578GB/s   time      6.51665e-05
32 bit indices, lengths_sum 898
            SLS    cache not flushed    prefetch off     b/w   3.87403 GB/s     effective b/w:          5.63495GB/s   time      3.05976e-05
            SLS        cache flushed    prefetch off     b/w  0.738564 GB/s     effective b/w:          1.07428GB/s   time      0.000160495
32 bit indices with prefetching, lengths_sum 898
            SLS    cache not flushed     prefetch on     b/w   4.64235 GB/s     effective b/w:          6.75251GB/s   time      2.55336e-05
            SLS        cache flushed     prefetch on     b/w   1.84529 GB/s     effective b/w:          2.68406GB/s   time       6.4237e-05
```

Reviewed By: shz0116

Differential Revision: D20582936

fbshipit-source-id: b97723384bdee74507b8a25167db6d90ab742342
  • Loading branch information
jspark1105 authored and facebook-github-bot committed Mar 22, 2020
1 parent fff5d32 commit d7c4a34
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 40 deletions.
69 changes: 34 additions & 35 deletions bench/EmbeddingSpMDM8BitBenchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,11 @@ using namespace fbgemm;

void print_fused_table(int rows, int embedding_dim, const uint8_t* table) {
for (int i = 0; i < rows; i++) {
std::cout << "row: " << i << " : " << std::endl;
cout << "row: " << i << " : " << endl;
for (int ii = 0; ii < embedding_dim; ii++) {
std::cout << (int)table[i * (embedding_dim + 2 * sizeof(float)) + ii]
<< ",";
cout << (int)table[i * (embedding_dim + 2 * sizeof(float)) + ii] << ",";
}
std::cout << std::endl;
cout << endl;
}
}

Expand Down Expand Up @@ -259,42 +258,42 @@ int run_benchmark(
}
}
}
}

#ifdef _OPENMP
#pragma omp barrier
#endif
if (fbgemm_get_thread_num() == 0) {
if (has_weight) {
cout << setw(16) << "SLW(WEIGHTED) ";
} else {
cout << setw(16) << "SLS ";
}
if (flush_cache) {
cout << setw(20) << "cache flushed";
} else {
cout << setw(20) << "cache not flushed";
}
if (prefetch) {
cout << setw(16) << "prefetch on";
} else {
cout << setw(16) << "prefetch off";
}

double max_time = *std::max_element(
times.begin(), times.begin() + fbgemm_get_num_threads());
double avg_time = std::accumulate(
times.begin(),
times.begin() + fbgemm_get_num_threads(),
0.0) /
fbgemm_get_num_threads();
double load_imbalance = (max_time - avg_time) / avg_time;

cout << setw(8) << "b/w" << setw(10) << bytes / 1e9 / max_time
<< " GB/s" << setw(20) << "effective b/w: " << setw(16)
<< bytes_padded / 1e9 / max_time << "GB/s" << setw(8) << " time "
<< setw(16) << max_time << " load_imbalance " << load_imbalance
<< endl;
if (fbgemm_get_thread_num() == 0) {
if (has_weight) {
cout << setw(16) << "SLW(WEIGHTED) ";
} else {
cout << setw(16) << "SLS ";
}
if (flush_cache) {
cout << setw(20) << "cache flushed";
} else {
cout << setw(20) << "cache not flushed";
}
if (prefetch) {
cout << setw(16) << "prefetch on";
} else {
cout << setw(16) << "prefetch off";
}

double max_time = *std::max_element(
times.begin(), times.begin() + fbgemm_get_num_threads());
double avg_time = std::accumulate(
times.begin(),
times.begin() + fbgemm_get_num_threads(),
0.0) /
fbgemm_get_num_threads();
double load_imbalance = (max_time - avg_time) / avg_time;

cout << setw(8) << "b/w" << setw(10) << bytes / 1e9 / max_time
<< " GB/s" << setw(20) << "effective b/w: " << setw(16)
<< bytes_padded / 1e9 / max_time << "GB/s" << setw(8) << " time "
<< setw(16) << max_time << " load_imbalance " << load_imbalance
<< endl;
}
} // flush_cache
} // has_weight
Expand Down
11 changes: 10 additions & 1 deletion src/EmbeddingSpMDM.cc
Original file line number Diff line number Diff line change
Expand Up @@ -529,6 +529,7 @@ typename ReturnFunctionSignature<inType, indxType, ROWWISE_SPARSE>::

// broadcast the scale
x86::Mem scale_src, bias_src;
constexpr int CACHE_LINE_LEN = 64;
if (is8bit) {
scale_src = x86::dword_ptr(
input, scratchReg1_, 0, block_size * sizeof(uint8_t));
Expand All @@ -539,6 +540,15 @@ typename ReturnFunctionSignature<inType, indxType, ROWWISE_SPARSE>::
block_size * sizeof(uint8_t) + sizeof(float));
a->vbroadcastss(scale_vreg, scale_src);
a->vbroadcastss(bias_vreg, bias_src);

if (pref_dist &&
fused_block_size % CACHE_LINE_LEN <= 2 * sizeof(float)) {
a->prefetcht0(x86::dword_ptr(
input,
scratchReg2_,
0,
fused_block_size / CACHE_LINE_LEN * CACHE_LINE_LEN));
}
}

if (has_weight && is8bit) {
Expand Down Expand Up @@ -648,7 +658,6 @@ typename ReturnFunctionSignature<inType, indxType, ROWWISE_SPARSE>::
}
}

constexpr int CACHE_LINE_LEN = 64;
constexpr int VLOAD_PER_CACHE_LINE =
CACHE_LINE_LEN / BYTES_PER_VLOAD;
if (pref_dist && (vec_idx + v) % VLOAD_PER_CACHE_LINE == 0) {
Expand Down
15 changes: 11 additions & 4 deletions src/EmbeddingSpMDMNBit.cc
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ GenEmbeddingSpMDMNBitLookup<indxType, ROWWISE_SPARSE>::getOrCreate(
x86::Assembler assembler(&code);
x86::Emitter* a = assembler.as<x86::Emitter>();
#if defined(FBGEMM_LOG_CODE)
string filename = "embeddinglookup_" + to_string(bit_rate) + "bit_";
string filename = "embeddinglookup_" + to_string(bit_rate) + "bit";
filename += "_emd_dim_" + to_string(block_size);
filename += areIndices64b ? "_64bit" : "_32bit";
filename += instSet == inst_set_t::avx512 ? "_avx512" : "_avx2";
Expand Down Expand Up @@ -584,6 +584,15 @@ GenEmbeddingSpMDMNBitLookup<indxType, ROWWISE_SPARSE>::getOrCreate(
vec_reg_t(scale_vreg.id()), half_vec_reg_t(scale_vreg.id()));
a->vcvtph2ps(
vec_reg_t(bias_vreg.id()), half_vec_reg_t(bias_vreg.id()));
constexpr int CACHE_LINE_LEN = 64;
if (pref_dist &&
fused_block_size % CACHE_LINE_LEN <= 2 * sizeof(float16)) {
a->prefetcht0(x86::dword_ptr(
input,
scratchReg2_,
0,
fused_block_size / CACHE_LINE_LEN * CACHE_LINE_LEN));
}

if (has_weight) {
a->vmulps(scale_vreg, scale_vreg, w_vreg);
Expand All @@ -599,7 +608,6 @@ GenEmbeddingSpMDMNBitLookup<indxType, ROWWISE_SPARSE>::getOrCreate(
// (epu8->epi32), and then get 4 zmms from each 128-bit portion of
// zmm via vpmovsxbd (epi8->epi32).
for (int v = 0; v < cur_unroll_factor; v += 4) {
// Divide by 2 because we're doing ymm load rather than zmm
int bytes_per_vload = (vlen / num_elem_per_byte) * sizeof(uint8_t);
auto src_addr = x86::dword_ptr(
input, scratchReg1_, 0, (vec_idx + v) * bytes_per_vload);
Expand Down Expand Up @@ -719,10 +727,9 @@ GenEmbeddingSpMDMNBitLookup<indxType, ROWWISE_SPARSE>::getOrCreate(
a->vfmadd231ps(out_vreg, temp_vreg, scale_vreg);
} // for each i

constexpr int CACHE_LINE_LEN = 64;
int vload_per_cache_line = CACHE_LINE_LEN / bytes_per_vload;
int v_aligned = ceil_div(vec_idx + v, 4) * 4;
if (pref_dist && v_aligned * 4 % vload_per_cache_line == 0) {
if (pref_dist && v_aligned % vload_per_cache_line == 0) {
a->prefetcht0(x86::dword_ptr(
input, scratchReg2_, 0, v_aligned * bytes_per_vload));
}
Expand Down

0 comments on commit d7c4a34

Please sign in to comment.