Skip to content

Commit

Permalink
handle binary data separately
Browse files Browse the repository at this point in the history
  • Loading branch information
liuzhiqiang authored and liuzhiqiang committed Mar 18, 2016
1 parent 39a4217 commit 88f5c68
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 64 deletions.
141 changes: 79 additions & 62 deletions gbdt/tdata.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,78 +15,80 @@

#define DT_LINE_LEN 0x100000

/* ------------------------------------
 * brief : heap_up
 *         sift the element at index l towards the root of a
 *         min-heap ordered by .val, stopping as soon as its
 *         parent is not larger than it
 * h     : heap array; h[0 .. l-1] is expected to already be a
 *         min-heap on .val
 * l     : index of ele need to heap up
 * ------------------------------------ */
static void heap_up(DPair * h, int l){
    /* keep the element being inserted aside; parents are shifted
     * down into the hole and the element is written back once */
    DPair t = h[l];
    while (l > 0){
        int p = (l - 1) >> 1;
        /* compare against the saved element, not h[l]: after a shift
         * h[l] holds a copy of the old parent, not the inserted value */
        if (h[p].val <= t.val){
            break;
        }
        h[l] = h[p];
        l = p;
    }
    h[l] = t;
}

/* -------------------------
 * brief : heap down
 *         sift the element at index p down a min-heap that is
 *         stored as two parallel arrays (row id / value); both
 *         arrays are permuted together so ids[i] always belongs
 *         to vals[i]
 * ids   : row ids, moved in lock-step with vals
 * vals  : heap keys
 * p     : index of ele down
 * l     : length of heap
 * ------------------------- */
static void heap_down(int * ids, double * vals, int p, int l){
    if (p < 0){
        return;
    }
    int r = (p << 1) + 1;           /* left child of p */
    if (r >= l){
        return;                     /* p is a leaf, nothing to do */
    }
    /* the element being sifted is held aside; children are moved up
     * into the hole and the element is written back once at the end */
    int id = ids[p];
    double val = vals[p];
    while (r < l){
        /* pick the smaller of the two children */
        if ((r + 1 < l) && (vals[r] > vals[r + 1])){
            r += 1;
        }
        /* compare with the saved value: vals[p] is stale once a child
         * has been moved up, so it must not be used here */
        if (val <= vals[r]){
            break;
        }
        ids[p] = ids[r];
        vals[p] = vals[r];
        p = r;
        r = (p << 1) + 1;
    }
    ids[p] = id;
    vals[p] = val;
}

/* -------------------------
 * brief : heap sorted
 *         in-place heap sort of the parallel arrays ids / vals.
 *         A min-heap is built and its root is repeatedly swapped
 *         to the end of the shrinking heap, so on return vals is
 *         in DESCENDING order and ids[i] still pairs with vals[i].
 *         The relative order of equal values is not preserved.
 * ids   : row ids, permuted together with vals
 * vals  : values to sort by
 * l     : number of elements; l < 2 is a no-op
 * ------------------------- */
static void heap_sorted(int * ids, double * vals, int l){
    int p, id;
    double val;
    if (l < 2){
        return;
    }
    // make heap : every internal node has to be sifted, last parent first
    for (p = (l - 2) >> 1; p >= 0; p--){
        heap_down(ids, vals, p, l);
    }
    // make sorted : move current minimum behind the heap, then repair the root
    while (l > 1){
        id = ids[l - 1];
        ids[l - 1] = ids[0];
        ids[0] = id;

        val = vals[l - 1];
        vals[l - 1] = vals[0];
        vals[0] = val;

        l -= 1;
        heap_down(ids, vals, 0, l);
    }
}

static DTD * load_ds(char * input, Hash * hs, int f){
static DTD * load_ds(char * input, Hash * hs, int f, int bin){
FILE * fp = NULL;
if (NULL == (fp = fopen(input, "r"))){
fprintf(stderr , "can not open file \"%s\"\n", input);
if (f == 1){
return NULL;
}
}

// Dataset Data Struct pointer
DTD * ds = (DTD*)malloc(sizeof(DTD));
memset(ds, 0, sizeof(DTD));

// vals for read data
char buffer[DT_LINE_LEN] = {'\0'};
char *token, *string = buffer;
int i, row, tok, offs, id, hsize;
int *tmp_cnt, *fea_cnt = (int*)malloc(sizeof(int) * hash_size(hs));
memset(fea_cnt, 0, sizeof(int) * hash_size(hs));
hsize = hash_size(hs);

// 1th scan for counting :
// rows , features, length of each feature, nonmissing value count
while (NULL != fgets(buffer, DT_LINE_LEN, fp)){
string = trim(buffer, 3);
strsep(&string, "\t");
Expand All @@ -111,17 +113,19 @@ static DTD * load_ds(char * input, Hash * hs, int f){
tok += 1;
}
}
strsep(&string, "\t");
if (bin == 0){
strsep(&string, "\t");
}
}
row += 1;
}

// malloc space for store data
ds->col = hash_size(hs);
ds->row = row;
ds->y = (double*)malloc(sizeof(double) * row);
memset(ds->y, 0, sizeof(double) * row);
ds->l = (int*)malloc(sizeof(int) * ds->col);
ds->cl = (int*)malloc(sizeof(int) * ds->col);
memset(ds->y, 0, sizeof(double) * row);
memset(ds->l, 0, sizeof(int) * ds->col);
memset(ds->cl, 0, sizeof(int) * ds->col);
for(i = 1; i < ds->col; i++){
Expand All @@ -131,65 +135,78 @@ static DTD * load_ds(char * input, Hash * hs, int f){
}
}
free(fea_cnt); fea_cnt = NULL;
ds->vals = (DPair*)malloc(sizeof(DPair) * tok);
memset(ds->vals, 0, sizeof(DPair) * tok);
ds->ids = (int*)malloc(sizeof(int) * tok);
memset(ds->ids, 0, sizeof(int) * tok);
if (bin == 0){
ds->vals = (double*)malloc(sizeof(double) * tok);
memset(ds->vals, 0, sizeof(double) * tok);
}
if (f == 1){
ds->id_map = (char(*)[FKL])malloc(FKL * ds->col);
memset(ds->id_map, 0, FKL * ds->col);
}

// rewind fp
rewind(fp);

row = 0;
// 2nd scan to load data
while (NULL != fgets(buffer, DT_LINE_LEN, fp)){
string = trim(buffer, 3);
token = strsep(&string, "\t");
ds->y[row] = atof(token);
while (NULL != (token = strsep(&string, "\t"))){
id = hash_find(hs, token);
if (f == 1 && (!ds->id_map[id][0])){
strncpy(ds->id_map[id], token, FKL - 1);
}
if (id != -1){
offs = ds->cl[id];
ds->vals[offs + ds->l[id]].id = row;
token = strsep(&string, "\t");
ds->vals[offs + ds->l[id]].val = atof(token);
heap_up(ds->vals + offs, ds->l[id]);
ds->ids[offs + ds->l[id]] = row;
if (bin == 0){
token = strsep(&string, "\t");
ds->vals[offs + ds->l[id]] = atof(token);
}
ds->l[id] += 1;
}
if (f == 1 && (!ds->id_map[id][0])){
strncpy(ds->id_map[id], token, FKL - 1);
}
}
row += 1;
}
fclose(fp);

for (i = 0; i < ds->col; i++){
DPair * d = ds->vals + ds->cl[i];
heap_sorted(d, ds->l[i]);
if (bin == 0){
for (i = 0; i < ds->col; i++){
if (ds->l[i] > 1){
for (int j = 0; j < ds->l[i]; j++){
if (ds->vals[ds->cl[i] + j] != 1.0){
goto sort;
}
}
// all vals equal 1.0, do not sort !!!
continue;
sort:
heap_sorted(ds->ids + ds->cl[i], ds->vals + ds->cl[i], ds->l[i]);
}
}
}

return ds;
}


DTD *(*load_data(char * train_input, char * test_input))[2]{
DTD *(*load_data(char * train_input, char * test_input, int binary))[2]{
if (!train_input){
return NULL;
}
Hash * hs = hash_create(0x100000, STRING);
DTD * train_ds = load_ds(train_input, hs, 1);
DTD * train_ds = load_ds(train_input, hs, 1, binary);
if (!train_ds){
fprintf(stderr, "load train data failed\n");
return NULL;
}
DTD * test_ds = load_ds(test_input, hs, 0);
DTD * test_ds = load_ds(test_input, hs, 0, binary);
if (!test_ds){
fprintf(stderr, "no test data or read failed\n");
}
DTD *(*ds)[2] = (DTD*(*)[2])malloc(sizeof(void *) * 2);
(*ds)[0] = train_ds;
(*ds)[1] = test_ds;

return ds;
}

Expand Down
5 changes: 3 additions & 2 deletions gbdt/tdata.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,12 @@ typedef struct {
int col; /* feature count of data */
int * l; /* row cnt of per feautre */
int * cl; /* cumulative row cnt .. */
DPair * vals; /* row id and feature value */
int * ids; /* row ids of feature */
double * vals; /* row feature value */
char (*id_map)[FKL]; /* feature id name mapping */
}DTD;

DTD *(*load_data(char * train_input, char * test_input))[2];
DTD *(*load_data(char * train_input, char * test_input, int binary))[2];

void free_data(DTD *ts);

Expand Down

0 comments on commit 88f5c68

Please sign in to comment.