Skip to content

Commit

Permalink
pipe: update densify script for R12
Browse files Browse the repository at this point in the history
Adapt to new column naming changes.
  • Loading branch information
vincent-octo committed Nov 17, 2023
1 parent 77264c4 commit 65a58b4
Showing 1 changed file with 16 additions and 16 deletions.
32 changes: 16 additions & 16 deletions pipeline/risteys_pipeline/finngen/densify_first_events.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,17 @@
----------
- First events
First-event file with a matrix-like structure:
. columns: endpoints with additional columns for age, year, number of events
. columns: endpoints with additional columns for age, day, number of events
. rows: one individual per row
Source: FinnGen data
Output
------
Outputs a Feather file in narrow format with all control information discarded.
Outputs a Parquet file in narrow format with all control information discarded.
- Dense first events
Feather v2 format
. columns: individual FinnGen ID, endpoint, age, year, number of events
Parquet format
. columns: individual FinnGen ID, endpoint, age, day, number of events
. rows: one row per event, so all the events from the same individual span multiple rows
"""
Expand All @@ -37,7 +37,7 @@
from pathlib import Path

import pyarrow
import pyarrow.feather as feather
import pyarrow.parquet as parquet


# How the controls, cases, and excluded controls are coded in the input file
Expand All @@ -53,7 +53,7 @@
# Output notation: control=1, case=1, excluded control=2
"CONTROL_CASE_EXCL",
"AGE",
"YEAR",
"APPROX_EVENT_DAY",
"NEVT"
]

Expand All @@ -68,7 +68,7 @@ def cli_parser():
)
parser.add_argument(
"-o", "--output",
help="path to output 'densified' file (Feather v2)",
help="path to output 'densified' file (Parquet)",
required=True,
type=Path
)
Expand Down Expand Up @@ -97,15 +97,15 @@ def main():

# Find endpoint columns
endpoints = list(filter(
lambda c: c + "_AGE" in in_header and c + "_YEAR" in in_header and c + "_NEVT" in in_header,
lambda c: c + "_FU_AGE" in in_header and c + "_APPROX_EVENT_DAY" in in_header and c + "_NEVT" in in_header,
in_header))

# Initialize arrays that will be used to make the Feather output file
# Initialize arrays that will be used to make the Parquet output file
fgid_values = []
endpoint_values = []
kind_values = []
age_values = []
year_values = []
day_values = []
nevt_values = []

# Get the endpoint data for each individual
Expand All @@ -127,18 +127,18 @@ def main():

# Get the event info
if args.keep_all or kind == CASE:
col_age = in_header[endp + "_AGE"]
col_year = in_header[endp + "_YEAR"]
col_age = in_header[endp + "_FU_AGE"]
col_day = in_header[endp + "_APPROX_EVENT_DAY"]
col_nevt = in_header[endp + "_NEVT"]
val_age = records[col_age]
val_year = records[col_year]
val_day = records[col_day]
val_nevt = records[col_nevt]

fgid_values.append(val_fgid)
endpoint_values.append(endp)
kind_values.append(int(kind))
age_values.append(float(val_age))
year_values.append(int(val_year))
day_values.append(val_day)
nevt_values.append(int(val_nevt))

in_file.close()
Expand All @@ -149,12 +149,12 @@ def main():
endpoint_values,
kind_values,
age_values,
year_values,
day_values,
nevt_values,
],
names=OUT_HEADER
)
feather.write_feather(out_table, args.output, version=2)
parquet.write_table(out_table, args.output)


if __name__ == "__main__":
Expand Down

0 comments on commit 65a58b4

Please sign in to comment.