
Commit

added to data ingestion.py
fundsan committed Sep 13, 2021
1 parent 852b437 commit 219790d
Showing 3 changed files with 36 additions and 26 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -5,6 +5,7 @@ practicemodels/**
 production_deployment/**
 sourcedata/**
 testdata/**
+ingestion/**
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
3 changes: 3 additions & 0 deletions .student_bashrc
@@ -0,0 +1,3 @@
+export PATH=/data/bin:$PATH
+export PYTHONPATH=/data/bin
+source /data/bin/activate
58 changes: 32 additions & 26 deletions ingestion.py
@@ -1,26 +1,32 @@
-import pandas as pd
-import numpy as np
-import os
-import json
-from datetime import datetime
-
-
-
-
-#############Load config.json and get input and output paths
-with open('config.json','r') as f:
-    config = json.load(f)
-
-input_folder_path = config['input_folder_path']
-output_folder_path = config['output_folder_path']
-
-
-
-#############Function for data ingestion
-def merge_multiple_dataframe():
-    #check for datasets, compile them together, and write to an output file
-
-
-
-if __name__ == '__main__':
-    merge_multiple_dataframe()
+import pandas as pd
+import numpy as np
+import os
+import json
+from datetime import datetime
+
+
+
+
+#############Load config.json and get input and output paths
+with open('config.json','r') as f:
+    config = json.load(f)
+
+input_folder_path = config['input_folder_path']
+output_folder_path = config['output_folder_path']
+
+
+
+#############Function for data ingestion
+def merge_multiple_dataframe():
+    #check for datasets, compile them together, and write to an output file
+    filenames = os.listdir(os.getcwd()+input_folder_path)
+    df_list = pd.DataFrame(columns=['corporation','lastmonth_activity','lastyear_activity','number_of_employees','exited'])
+    for each_filename in filenames:
+        df1 = pd.read_csv(os.getcwd()+input_folder_path+each_filename)
+        df_list=df_list.append(df1)
+
+    result=df_list.drop_duplicates()
+    result.to_csv(os.getcwd()+output_folder_path+'finaldata.csv', index=False)
+
+if __name__ == '__main__':
+    merge_multiple_dataframe()
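
For comparison only, not part of this commit: a minimal sketch of the same ingestion step, assuming config.json supplies 'input_folder_path' and 'output_folder_path' as folder names relative to the working directory. It uses os.path.join instead of string concatenation, so the config values do not need to carry their own path separators, and pd.concat instead of DataFrame.append, which newer pandas releases no longer provide.

import os
import json

import pandas as pd


def merge_multiple_dataframe():
    # Load the input/output folder names from config.json (keys as in the commit above).
    with open('config.json', 'r') as f:
        config = json.load(f)
    input_dir = os.path.join(os.getcwd(), config['input_folder_path'])
    output_dir = os.path.join(os.getcwd(), config['output_folder_path'])

    # Read every CSV in the input folder. Filtering to *.csv is an added
    # safeguard here, not behaviour of the committed script.
    frames = [
        pd.read_csv(os.path.join(input_dir, name))
        for name in os.listdir(input_dir)
        if name.endswith('.csv')
    ]

    # Concatenate, drop duplicate rows, and write the combined dataset.
    result = pd.concat(frames, ignore_index=True).drop_duplicates()
    os.makedirs(output_dir, exist_ok=True)
    result.to_csv(os.path.join(output_dir, 'finaldata.csv'), index=False)


if __name__ == '__main__':
    merge_multiple_dataframe()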
