@@ -249,7 +249,37 @@ void clampedExpVector(float* values, int* exponents, float* output, int N) {
249
249
// Your solution should work for any value of
250
250
// N and VECTOR_WIDTH, not just when VECTOR_WIDTH divides N
251
251
//
252
+ __cs149_vec_float res,reg_x;
253
+ __cs149_vec_int reg_exp,ONE=_cs149_vset_int (1 );
254
+ __cs149_mask selected,not_zero;
255
+ float TEN = 9 .999999f ;
256
+ __cs149_vec_float TRUNCATED= _cs149_vset_float (9 .999999f );
257
+ __cs149_vec_int ZERO = _cs149_vset_int (0 );
258
+ // const int MULTIPLE_BOUND = N - (N % VECTOR_WIDTH); // last index i % VECTOR_WIDTH=0
252
259
260
+ for (int i=0 ; i< N; i+= VECTOR_WIDTH){
261
+ int N_SELECT = std::min (i+VECTOR_WIDTH,N) - i;
262
+ selected = _cs149_init_ones (N_SELECT);
263
+ _cs149_vload_float (reg_x,values+i,selected); // x = val[i]
264
+ _cs149_vload_int (reg_exp,exponents+i,selected); // exp = exp[i]
265
+
266
+ // _cs149_veq_int(is_zero,reg_exp,ZERO,selected); // if exp[i] == 0
267
+ _cs149_vset_float (res,1 .0f ,selected); // res[i] = 1
268
+ // not_zero = _cs149_mask_not(is_zero); // else
269
+ not_zero = _cs149_init_ones ();
270
+ not_zero = _cs149_mask_not (not_zero);// not_zero is now all zeros (0...0)
271
+ _cs149_vgt_int (not_zero,reg_exp,ZERO,selected);// only updates the lanes enabled in `selected`
272
+ // _cs149_vmove_float(res,reg_x,not_zero);
273
+ while (_cs149_cntbits (not_zero) >0 )
274
+ {
275
+ _cs149_vmult_float (res,res,reg_x,not_zero);
276
+ _cs149_vsub_int (reg_exp,reg_exp,ONE,not_zero);
277
+ _cs149_vgt_int (not_zero,reg_exp,ZERO,not_zero);
278
+ }
279
+ _cs149_vgt_float (not_zero,res,TRUNCATED,selected);// if res[i] > 9.999999f
280
+ _cs149_vset_float (res,TEN,not_zero); // res[i] = 9.999999f
281
+ _cs149_vstore_float (output+i,res,selected);
282
+ }
253
283
}
254
284
255
285
// returns the sum of all elements in values
@@ -270,11 +300,32 @@ float arraySumVector(float* values, int N) {
270
300
//
271
301
// CS149 STUDENTS TODO: Implement your vectorized version of arraySumSerial here
272
302
//
273
-
274
- for (int i=0 ; i<N; i+=VECTOR_WIDTH) {
303
+ float ans =0 ;
275
304
305
+ __cs149_mask selected;
306
+ __cs149_vec_float reg_x;
307
+
308
+ for (int i=0 ; i<N; i+=VECTOR_WIDTH) {
309
+ int N_SELECT = VECTOR_WIDTH;
310
+ if (i+VECTOR_WIDTH > N)
311
+ N_SELECT = N-i;
312
+ selected = _cs149_init_ones (N_SELECT);
313
+ reg_x = _cs149_vset_float (0 );
314
+ _cs149_vload_float (reg_x,values+i,selected);
315
+
316
+ int vec_width = VECTOR_WIDTH;
317
+
318
+ while (vec_width > 1 )
319
+ {
320
+
321
+ // add
322
+ _cs149_hadd_float (reg_x,reg_x);
323
+ _cs149_interleave_float (reg_x,reg_x);
324
+ vec_width >>=1 ;
325
+ }
326
+ ans += reg_x.value [0 ];
276
327
}
277
328
278
- return 0.0 ;
329
+ return ans ;
279
330
}
280
331
0 commit comments