From 6317a73b8f041edb1f800af10eef67e22d7a46be Mon Sep 17 00:00:00 2001 From: Miracle Date: Thu, 17 Mar 2022 20:35:56 +0100 Subject: [PATCH] python etl --- .DS_Store | Bin 0 -> 6148 bytes .vscode/settings.json | 3 + README.md | 21 + cake_data.csv | 242 +++++ docker-compose.yml | 18 + etl/__init__.py | 0 etl/__pycache__/__init__.cpython-38.pyc | Bin 0 -> 135 bytes etl/__pycache__/extractor.cpython-38.pyc | Bin 0 -> 1033 bytes etl/__pycache__/loader.cpython-38.pyc | Bin 0 -> 1531 bytes etl/__pycache__/master.cpython-38.pyc | Bin 0 -> 677 bytes etl/__pycache__/models.cpython-38.pyc | Bin 0 -> 1616 bytes etl/__pycache__/reports.cpython-38.pyc | Bin 0 -> 3729 bytes etl/__pycache__/transformer.cpython-38.pyc | Bin 0 -> 3519 bytes etl/__pycache__/utils.cpython-38.pyc | Bin 0 -> 1003 bytes etl/extractor.py | 27 + etl/loader.py | 43 + etl/master.py | 22 + etl/models.py | 48 + etl/reports.py | 122 +++ etl/transformer.py | 151 +++ etl/utils.py | 34 + main.py | 6 + reports/reports.html | 882 ++++++++++++++++++ reports/test_reports.html | 36 + requirements.txt | 5 + tests/__init__.py | 1 + tests/__pycache__/__init__.cpython-38.pyc | Bin 0 -> 137 bytes .../__pycache__/test_extractor.cpython-38.pyc | Bin 0 -> 875 bytes tests/__pycache__/test_loader.cpython-38.pyc | Bin 0 -> 1548 bytes tests/__pycache__/test_models.cpython-38.pyc | Bin 0 -> 2910 bytes tests/__pycache__/test_reports.cpython-38.pyc | Bin 0 -> 1161 bytes .../test_transformer.cpython-38.pyc | Bin 0 -> 3532 bytes tests/test_extractor.py | 21 + tests/test_loader.py | 55 ++ tests/test_models.py | 89 ++ tests/test_reports.py | 43 + tests/test_transformer.py | 166 ++++ 37 files changed, 2035 insertions(+) create mode 100644 .DS_Store create mode 100644 .vscode/settings.json create mode 100644 README.md create mode 100644 cake_data.csv create mode 100644 docker-compose.yml create mode 100644 etl/__init__.py create mode 100644 etl/__pycache__/__init__.cpython-38.pyc create mode 100644 etl/__pycache__/extractor.cpython-38.pyc 
create mode 100644 etl/__pycache__/loader.cpython-38.pyc create mode 100644 etl/__pycache__/master.cpython-38.pyc create mode 100644 etl/__pycache__/models.cpython-38.pyc create mode 100644 etl/__pycache__/reports.cpython-38.pyc create mode 100644 etl/__pycache__/transformer.cpython-38.pyc create mode 100644 etl/__pycache__/utils.cpython-38.pyc create mode 100644 etl/extractor.py create mode 100644 etl/loader.py create mode 100644 etl/master.py create mode 100644 etl/models.py create mode 100644 etl/reports.py create mode 100644 etl/transformer.py create mode 100644 etl/utils.py create mode 100644 main.py create mode 100644 reports/reports.html create mode 100644 reports/test_reports.html create mode 100644 requirements.txt create mode 100644 tests/__init__.py create mode 100644 tests/__pycache__/__init__.cpython-38.pyc create mode 100644 tests/__pycache__/test_extractor.cpython-38.pyc create mode 100644 tests/__pycache__/test_loader.cpython-38.pyc create mode 100644 tests/__pycache__/test_models.cpython-38.pyc create mode 100644 tests/__pycache__/test_reports.cpython-38.pyc create mode 100644 tests/__pycache__/test_transformer.cpython-38.pyc create mode 100644 tests/test_extractor.py create mode 100644 tests/test_loader.py create mode 100644 tests/test_models.py create mode 100644 tests/test_reports.py create mode 100644 tests/test_transformer.py diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..74fc874c53ca19625ff4eea7ef2e127df7c36710 GIT binary patch literal 6148 zcmeHK&5qMB5FT&4G%Z?D5Mnvzg2Z82btz&G2q~2Ium>bW5F7w?lO|##(zqn)mQspx z;ThOh;7ND{9)t&g&p(UOv`Cx~LLO=SP3)O*{5iH`B4WL1a6r^1A`2?8-a@fKM7z*6 z+0r!^flTHYQa~Q%(ET-UQ(+h|4E$>hh_$;zBdpjr6jFKpjwqwkqnMBUnEUV&Tf{x+ z%^y-=5naK0=ZF@uDcyp_X-pH88EW=vj;FYi;|t=>N;zuq%r|F5#>G`=r5FGPcfdXlK*=p zf5qFMWfRTwSAP;mS>EYy>-LXKwE0)2!#EFT~ShFF*0M{gm_g zm+kq_qHs2BUwz1v+>4ShmBCRE!sX@jC<*wi$EQh<$vm!Vu&k!l9Ja457M(r2+r7Qq zvloYlu5m9YP 
z*jt*)sR}VEA*L79VS;jcNBIT{%Y<*kD}?tN+43_je`4qFWN?K6J$07Y1B@633D|*dq;j-x)XC0n$j>}7+7VXrXJhk{2%=O{=XVzo(uzqf&YpDR`0w0 z9)_gP*0sTjv(|g`kg7ZP*Ng(<$h(HF6K#l_t7qb9~6oz01O-8?!3`HPe1o2BpKeRZts93*3 z-zC31FDE}SrC2{XBQYl@H7`9?KeZ%BKR!M)FS8^*Uaz3?7Kcr4eoARhsvXFb&p^xo E0O}bXS^xk5 literal 0 HcmV?d00001 diff --git a/etl/__pycache__/extractor.cpython-38.pyc b/etl/__pycache__/extractor.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5255c8dad423f2d2fee99deab06fee6a7e3b8939 GIT binary patch literal 1033 zcmZ8fL2uJA6t~18f-12xK)F~UcGpa4QfwEtDm!gg z#2?_sX@ALAPW*+Pc+YJ^*pnRRJ=>qZ_kBLz+UgOANBr?;#R&P0NizaSUcfL<;3J8o zIf-#*Ii0bXQSye!K!#UDhLksTDqXBi{+sxP$vu4zoJ5W_5rjo==!P%cw zG8tTjaj06V{SYvr(voecbYusRBiDU>W_6O`T)X~RgbUZ{WLQ?%@bp|S^*m_6PxgfE@#}73!N&CSM=&5FZGGpt24xDve@Iu zh1-(bsrnCKbLB{fz-Pc*XjDGYEl943?!a{Oba-r(Hp8>wVfn4d%S4)CI!*Fi6(?$_ zY(89*EcNdE+(klUMP`M-!^XGYS>CG!+TARJ5pAvPLKi88otr=#P#Uk%EBLXbBBxZ+ zOZH>uf?9SORNj#9?1I^_VwaQ>+rs%Hi2IDZBS+*H8T1(-!7_=q={Ak% z^1(m7n;S>)R>7YlnJFP$Pl#D57w}Sr>kIK^k>u-@z*w!jfTz(v7hTLx@@PCBk7s`VjbE&+^azYE*Z$f4>Jjn{7rzg% zk-ISU1_(|#m87HzrKmH>5|4Y_mwp)}K^Z0?*aIFOlce*I@D7iT36G9g5^=Ue2Ho!v zgAAzk9~atef2X)&w!4|V6i;d{r0utAHLjnkGDq*<2j;Q~Q=bQ6$O+~26f?otUh{V^ zhOt>!mB@{bO&yzk5%bJsyO|bl(*`+QrMKZyRtnq6Tjx(ibs?e?KW>d)Xrc7z?P#lh zQ^`8xdX(>HQi^IUM#9Ju+JqWT4pjuFaB2pE_-?(FwGUf&Vd`5T1W70-3FDFQgnx~Q zfU}PbF+2{EaE=}CAX+wj>~4C17-8Zm7Gt|cXp@$JSB`+p2kMxH1L!g+Gp2in91%)p zbVgDD3xcxdvqJ)*dcI%iIG35$u`4q+Dy#HftxCZiZgEkaMR9TXo*L`5#R|hMGF)2< z^gq8v`fIT)UW*tjiE9;4tHQ*%SyW?ZT%3aUa?q+ilr`|IuZ7%m3$+54{~k{vmu-}$ z1;k3zRaof|1nK%TWX**Z&*6MOM*_|VVuu(y{O*Hdfb2&U>BYTcG9w3ciyYB+)UX5Z z#6JaG!NW%nX9;GGj9?4gsSwcaG;jL!Z!7@}7QRul$`B7l1ARFYEInH^}?p9nc~0S=4}Gov z%gM$~jE%Gnfl zcTvO*tP%M{f{3oN9*bz7HLGW9-5$^fg9s^AWu-_{+e@+arV{mjn!cK5vfWWgAcd$| zf48osx`6AeC>Ebft076%IS}aKaE|De^Ui`zZw@Dbq6=rwl{Nf@@beUvP#ti$A-0Ru f?jrZHZD(M(Kiy-a^)}rN}>P4NOkso6L literal 0 HcmV?d00001 diff --git a/etl/__pycache__/master.cpython-38.pyc b/etl/__pycache__/master.cpython-38.pyc new file mode 100644 
index 0000000000000000000000000000000000000000..a2d9a247650830b006f45faf7a70f47ec45c5825 GIT binary patch literal 677 zcmYjOxo+Gr5G6(3yPI!N<6=`p{UQi*xiefiHW08`C=%^mfD(lv1v`OU*`JYOUFMfm z##O5HsWL<5@DLolIXHZeyjZPrhW34N^yw#O?2&?}CNOx7+rCBDu!d{aa>+U2KnJam z!egPsHj@W=;Bmt+O9^3tRIkJHByH)y@D0@rkJ#MQCvj zRc9{^v_6c_RBymxA>ViX;I@~0t!f(a*CGT&@*QrwLFd@W?)WWteB_7VfpFl&7#u>6 z#V8KZ1LMrackDJC!$U$*!;xI@OK;pZ5Y}5-t#;ROn#6Wr3R)oDAjsZ&GKxBJT^LR6BDQnjMc`_2kY|@N#QD8yI zR)TiS1WjWCE{)&R*73UyL%a&-YZRDAeDFvBJ_%X$HiiJd@O?lM)KUmo1`%pK)N)*j z-}|JGa)EM!^1!0l5VaIhBgimRJyX1-XBZYaSBJW}ewLTmoXX^!OI9{EVx^ju7FU9* zmkpDhgB@d3GtJJITr)7qQu6a9Lt7-5LKuvV;x%i>zV4y(IQ8um=mIBaIvB5+tg0l& zw5m*ARe0A@Sh=XG^CcBGD`{2Xy=V>^X0mT z<<1Y^ReP^?-WM&;^^_=3%;H2u|4CkOk- zM`oZj$B0l-;X}J$J^iVDhaLWYIyq^XZYS?1yXsQnCD2alQz`_;kWE-8CN7HY^Tn0U zaKp#gTN{SVkD{SJ^b7pl%6}8!9l~!3aDsJZDd5uYlb9teg=pJDi>oC}Z^n8o-}KO) zIeQO@V(CzzC3lv-tvW;w5IJVh+Znt2i_JOsv=B`}zeSGTm)&b_H z{0a~2t`=;37mwAfOT?HI*o-bkrytrKU+mtb^e(pKvTU$=xl4AdH!Sk;(Sp@{##vkP z*~)9)mM-TdUMN+kw=C{U>+x;cOPh9kv~~GOTiy~#e}d5QnjYY!hjy_4K)q%MTfXi& zCgid+qpG43Pj^9StIGMU0|c26{}IkP6VEJ5odYJCA}iemAK12%KC>r(j>8k|j!QXm zY2DxOZFei3MbOOX~Wrj#Bxk-UWMWdue2^fndXGpHR zAJoiHvKI77y5`VxT|l}N$iLI$UVF-+hn@mB3DEaumn%_LE?r_~-psx?A8&u}&5vI# zFM9?&|9x|{|F31k_&0XWKNfU8h9bA2Vg@6T5gR>&pln7YHhZR)t;mY)o(;0iYEdnA zdXAQzs2;mLml&TKtj^qHgSkg$ufZBSM$7va<}_L)cUhpg3S$nk{fAI0Xvv?xCB@tN zAWafJz`^Eb@RWa^CZqI^A|9ai`Nx6I$53PsDsJ=$GkWF-_RLIX4Q+05`=-Hb_+$19 z*e7$2Ydr^g^%sP>tZ{7j>d^C`x5S#zbJ;RmIW}OwSJ-RNddz}-U(KB__#_o7Tf_a( zq9f@+sK)g7$$k)pjBW=pr>UTS=A$6V>hJvSldVTvzu9itxeccc^7%r@dAmMVakSm4<*$#p^6_B)P^7URBcenD{3|dT#1d9bs6f@0JQ`m1?i)i~ z0hO2UwZD>F$o8}LR(g;`X~1NAFb<-KCnMhGDymkW9Oka?he@b>AD56WRL06R;=nJv zQov|^t^sgI!{>5=M^9=jO~jF=ynI5CbIZ6yvl&!Gp># ze?fmq?Lu_NIp+sd4-CVISK2u_(3njpJkc)6Yue#?Z4jlBw=A&&TM*Zv%3ZYd)Bdh@ zOi{gtFP625uTmd1-hv^Cv1&L?;*soU<-r$};Pj5*YATWerh)lmgHVT}5C2aP5bC?e zbvWjA<2jiT27$0kpl@F{9uy-r-5R?lb8Z^wlr7`Q4R94iq>wSy>}l*8>O8$ zw*&nTo?A1kw7soumsAac&b((2uwdFFJ%o 
zQDU`HtAsDK8!kj;6=M*Q`l}VQRYuu^PB9|YVZ=XL>!*TovGM-o8I>vKr+yR+o<5{~ zs7E55ChV>byN%3R>w4M!&OC5S`yn9l?g0-+W3@s1X~Z7FreR2i8Q-8E+?zamxJJ9# zQm0)S!X&kh5?xQ=_si^#8h2r&jg12hyvgd6Y88CTiec9EUV069kIw0OEvE=Z9hMlN zx&4$M%G`rhLIsI7pH6^7GG(n5B>n!bbccB?Mj7_>9y0enal=TAq43O;Dohfk{|y7;=S zP3VQue##CP=fU|p1IURS05qQ*+p{YS&@>j^Y_suyM!-9dKYaY)@oyhu9DLM(Swt7X zyEBdBKpbiWWB*IwD^dmVq#D^#ftt#qTLAnQb92_!OreZ%P+>49K>o@qySp0SUH$7F zy87A1>gU;;)w`YPV8A6#2fBJXfk4e3UvyIYGIw;^i@xi$7e7<>UV-X46<|~r>kX$- zba;nOA|6N}Fp&1?TGm)Y80?1}E(5o?Bj1L-2n2m@CFw!#LJ$s-!igI=R+|6;P{kc; zo?Q_a19G{HslqRhz*63Z%5Y4}{?W9|AMK^44o4S)(Y0L1Gpp-j!St0-#QQ~u;yMV* zID(r{krPu{$OgYAy0yM0kBn3DoFHdC25z+l$sB^m$^KJ8>r<{IJtIGpRLT(3_5g^V zq;Q=@oY63$TnNbF$R?;}6ZBvV*JF|0W6YW~NfnKeE=rzAey3VtBKRN#k^xS~NedHA zCBu?RCMA{JM=E)S^PjjEF5_uHsS*_c^8iv-EmG}Z8!lTL7lF2>7kR~<$W(Tq$akR{ z0pWP^7T&hI;Iv(A;Wpj1pxDfKQTv;@XM-EbH?`b_NEL^E$OIbZD=dU%Aa;QT`Ee}P zaBw*l;V4Xk$e#i=62FGAmYJLD_i`KedJ!{T2TwRi<4IS7szmt)8i zgypSwq7>4d?1C_#1}+bTi$G$L*MYylpA9LHYP1UM_L{g6WH$N@(!MixBj9h7F@z%Z zKdFXe-ZEi7FSA=ey^GcoGC_2?@8_Nm1e8sII{A6i_xGkjRJ;)u%nE-OfN+bSwqY2~K-E3LWR@``A7)ZqkR zQmlPjV;P8SnDF(zDHn&5tM&a*hAI`>L%D;D7toI$%pEugkEHgbp7yexZ1Y8Sd(@>FJsAI#~lrb|sqCelTbigAfujD@2qKSdlm=arV&KO{+as)k(R98-s_6MLv0-m?p(=&DqqOGo}uea*$_j&4h z%MS;G(8BYm|K9H3E?L&!sj>PvX#5PXx{rceoaI)*V#cU$=XT-5PT|Hb+75Sfukd5v z)V;h@1aVMw<8BegVbP0w%=(4JecpLt@y?MQ_j&NZ8g^e{U2Dj6Z!#@M^)#)ssvO$d zy_2b02lpn_PV-l+nbp0SJ`r~-F7lzPLp8~>I;q9;S`YR_o$RJcB-1jhbvIKHr! 
z>Fd0-dBD5q>++EIURZI+`+R`k9$({c;J44$@u8P=Z%3x38dtIq@=K-zneHb%t<#ac z;t|#9y@^*5G;=n$5;%h;?AdR_==9lnoT+G(r%FZjVp$a-5;7gtf=9G)l#WJ~1f?>PZ4Y{Yb+7|7w@NMhdlI^5z*@trkloI(^4$%10l5gPW>&?xF zN=UW&eDhYdU*=WH)#m7Nn&+b26Pu#WH(QT|ygoV5L6T$uElIjCS0);r*~Ut=t}iBl zC%zQ*RF)$m#Nu&Lq1OcMIx4kwWE~=KpR-%mXKx?cbL-fi+b8VMnLDySXUF(GVa$4p zIs(gPy#26tTV-On4AOnG@3}ZFzhn_|6{S%Q?nc$P3A1XK24!6uGSSlCXl)Y(h;or+ zwJ5ZoP9~z{LszchL*h&AEWwiz$gko~R+?HIRMXnjOx&Ad9Y%VsH8KRdj6OwhSskB+ z_;XpG&93b%cmov98UWTXD<@c;T#{sehNK=k2rCosT~rWh5cQaGcEZdrV$Gc|>`(1O z8!@-%j_GwVN=VR5?}X-BK!RF6wRq>HcjyA4M{`>~s@<_QcYgbYRP|0jBgDy7ftt||3 zj-YZnFy=|dwOe8j?HRjm7_LubMV_^rYX`!q`-^TfeXwX5GwB21p=anQzlFN|HWhE7 z&`v4#&yi1VU?9S)zKO!>xh!D8ze3+;Hfn2Z?Vo-y`%a7C8Sbr|gONklrw{S1J>UO8 zgF~7346}}{6YH3rSUBUXt%HNydE(0txeFDrxpx(;JF(<1QHPE4Ykt+fA8JGDEv zkJu4AbZc*pUz9Enu3AUzi6fuW{HA|SYyZCX>kh`Twwa0fy!!?C{`9M+_s}kHHS5;_ z=Kp1GA9tHJbjWSzL@R54W!8ibO<%L#>9{?%p;Y1QuPvi~-9TxCo%j=_B}C|yRPC&` z^7;`Vqpc<{oLWWH)ml&@Sgdz;-tu5wcr+I5G|L&cP8mgs>AZv5&PvS+oC51>_KvoT zLXx(>`Zu1Ty}d07eT;f0kIQS+3bh9r%_iE*t9|77PNvd)@;H@mVpMjh2&kY0s=YCC zP%SC1z>Ka83pUeQhRAV$``&hP_iplV`{SJl@%qBago`>$b9J8M@51mWc-41NSp5Kc z+IIrxx>uO%PWsh=`P*B-_s}51)6MJ9W+b#A z^RWZcgXnYna~Je_pAs2&KoNKLrxw8lMTi+q-fXV`q$g3MFH1CE=ZFb<1HPF&l_axd zytKNI19aWM{X z{^Z7;k8dS6Z~W@#$?ZEgKDl@Q!O&^cX_F?I>2wiUCPeu@np3=rw9E?oHuB$@ceT2$6nPG_i6^^5MDmm|R7cv1cI%4w2Mp95aB5XWh9KS`cW)4Umxgq(JHHmW85xv@+tla!Yvr4y5?B%NhL5J}#woOI(WIVO83 zNKecqDqxqpm)ADl3bulEGeXYzBuC|5BQkz{FrfzR-fUzzN!-)kw9cR>ONpXr8J#j~ YU9`OpK59Pir=*LD9O<)lQ22`TAA{j}mH+?% literal 0 HcmV?d00001 diff --git a/etl/__pycache__/utils.cpython-38.pyc b/etl/__pycache__/utils.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..20f1a8361e115edb286e9e9403dae54045811023 GIT binary patch literal 1003 zcmZWnO>Yx15cSxb&32O%fdVQA#3DhZ6|F}9ZM^@0xQ*@kg zP^5%y0KzF}pUF1m5pQ5*JmyV|5l{F6#s*I}NZ$H^w@41IIo9=TIyfAHPh1X$F+v-n znPqgA91#mU^n|XFkKGUS7>+52Gj>GJVT%Cy)aDecviY)^Drqxgb+#|ixmq;SzrEvuY4Fb;MdjK31?pW$cT+cZL(-a&7z 
z-mg?$aMK%33Z+Cf7Cm8AZ)T-3Po{@14G{WP99Yl6ct=&N%2c@GW8sd(12diwyrLf3TFZz#G zDV36#LHPPc*p}6BA`Cu08w-me^5p{03)jL^u7i91L19GS@A{Ovg^om&P`I?;aYTP< zpZ|BJ7kN!CmCgB?05T)wH0Jd1#ssI20 literal 0 HcmV?d00001 diff --git a/etl/extractor.py b/etl/extractor.py new file mode 100644 index 0000000..ede35e0 --- /dev/null +++ b/etl/extractor.py @@ -0,0 +1,27 @@ +import csv + +from typing import List + + +class Extractor: + def __init__(self, in_file_path: str): + """ + This class extracts data from source file + + Args: + in_file_path: path to the source file + """ + self.in_file_path = in_file_path + + def extract_data(self) -> List[dict]: + """ + Extracts data from CSV file + + Returns: + data as a list of dictionaries + """ + + with open(self.in_file_path, "r") as csvfile: + reader = csv.DictReader(csvfile, delimiter='|') + next(reader) + return list(reader) \ No newline at end of file diff --git a/etl/loader.py b/etl/loader.py new file mode 100644 index 0000000..6ac19cd --- /dev/null +++ b/etl/loader.py @@ -0,0 +1,43 @@ +from typing import List + +import mongoengine as me + +from .models import CakeModel, CakeMongoOrm + + +def connect(): + """ + Connects to the database + """ + me.connect("cakes") + + +class Loader: + def __init__(self, cake_data: List[CakeModel], test_mode: bool = False): + """ + This class loads transformed data into the database + + Args: + cake_data: transformed data + test_mode: live mode or unit testing mode + """ + + if not test_mode: + connect() + + self.cake_data = cake_data + + def load_data(self): + """ + Inserts data into the database + """ + + print("Preparing data...") + cakes = [CakeMongoOrm(**data.dict()) for data in self.cake_data] + + CakeMongoOrm.objects.delete() + + print("Inserting data into the database... 
please wait") + CakeMongoOrm.objects.insert(cakes) + + print("Data loaded into the database successfully!") diff --git a/etl/master.py b/etl/master.py new file mode 100644 index 0000000..fe9c269 --- /dev/null +++ b/etl/master.py @@ -0,0 +1,22 @@ +from .extractor import Extractor +from .loader import Loader +from .transformer import Transformer +from .reports import Report + + +def run_etl(input_file: str): + """ + Runs whole ETL pipeline + + Args: + input_file: path to the source file + """ + extractor = Extractor(input_file) + transformer = Transformer(extractor.extract_data()) + loader = Loader(transformer.transform_data()) + + loader.load_data() + + # create reports + report = Report() + report.create_report() diff --git a/etl/models.py b/etl/models.py new file mode 100644 index 0000000..351addb --- /dev/null +++ b/etl/models.py @@ -0,0 +1,48 @@ +from typing import Optional + +import mongoengine as me +from pydantic import BaseModel, Field + +VALID_CAKE_FLAVORS = [ + "butter", + "carrot", + "black forest", + "avocado", + "vanilla", + "caramel", + "rainbow", + "chiffon", + "cream", + "babka", + "sponge", + "apple", + "strawberry", + "biscuit", + "chocolate", +] + +VALID_UNITS = ["mm", "in", "m"] + + +class CakeMongoOrm(me.Document): + """ + Mongoengine model of Cake document + """ + + entry_id = me.IntField(required=True, unique=True) + name = me.StringField(null=True, choices=VALID_CAKE_FLAVORS) + diameter_in_mm = me.FloatField(required=True) + vegan = me.BooleanField(null=True) + original_unit = me.StringField(choices=VALID_UNITS, required=True) + + +class CakeModel(BaseModel): + """ + Pydantic model of a cake for data validation + """ + + entry_id: int = Field(description="The entry id of the cake") + name: Optional[str] = Field(description="Name (or type) of the cake", default=None) + diameter_in_mm: float = Field(description="Diameter of the cake in millimeters") + vegan: Optional[bool] = Field(description="Specifies if cake is vegan or not", default=None) + 
original_unit: str = Field(description="The original unit of cake's diameter") \ No newline at end of file diff --git a/etl/reports.py b/etl/reports.py new file mode 100644 index 0000000..37cb4b4 --- /dev/null +++ b/etl/reports.py @@ -0,0 +1,122 @@ +from datetime import datetime +from typing import List + +from mongoengine.queryset.visitor import Q + +from .loader import connect +from .models import CakeMongoOrm + + +class Report: + def __init__( + self, data: List[dict] = [], + caption: str = 'Cake Reports with Invalid Name or Vegan', + bg_color: str = '#FADBD8', + path: str = None): + ''' + This class creates reports + + Args: + data: a list of dictionaries + caption: a string to caption report + bg_color: string to give color to report + path: the path to write report to + ''' + self.data = data if data else self.get_data_from_mongo() + self.caption = caption + self.bg_color = bg_color + self.path = path if path else './reports/reports.html' + + def write_to_file(self, content: str): + ''' + Writes the html string to a html file + ''' + + # Save the HTML code + file_obj = open(self.path, 'w') + file_obj.write(content) + file_obj.close() + + + def create_html_table(self) -> str: + ''' + Creates table data for reports + + Returns: + a string of html table + ''' + + table: str = "\n" + table += "\n" + table += '\n' + for k in self.data[0].keys(): + table += '' + table += '\n' + + table += " \n" + for row in self.data: + for k in row.keys(): + table += '\n' + table += '\n' + + table += '\t
" + self.caption + "
' + k.capitalize() + '
' + str(row[k]) + '
\n' + return table + + + def create_report(self): + ''' + Creates html data for reports and calls the method that writes to html file + ''' + + # Start the page + content = ''' + + + ''' + self.caption + ''' + + +
+ \n + ''' + + # Add content to the body + content += self.create_html_table() + content += '
' + + content += "\t\n" + content += "\t\t\n" + content += '\t\t\n' + content += '\t
SummaryTimestampStatus
Cake reports' + datetime.now().strftime("%d-%m-%Y, %H:%M") + 'Success
\n' + + # Close the body and end the file + content += ''' +
+ + + ''' + + self.write_to_file(content) + + print(f"Reports created successfully, please open '{self.path}' to view") + + def get_data_from_mongo(self): + ''' + Gets a list of possible cakes filled in error from mongo whose data might not make sense, + precisely, cake data with invalid name or vegan + + Returns: + a list of dictionaries containing cake data + ''' + + connect() + + cake_objects = CakeMongoOrm.objects(Q(name=None) | Q(vegan=None)) + + return [{ + 'entry_id': cake.entry_id, + 'name': cake.name, + 'diameter_in_mm': cake.diameter_in_mm, + 'vegan': cake.vegan, + 'original_unit': cake.original_unit + } for cake in cake_objects ] + \ No newline at end of file diff --git a/etl/transformer.py b/etl/transformer.py new file mode 100644 index 0000000..24ee192 --- /dev/null +++ b/etl/transformer.py @@ -0,0 +1,151 @@ +from string import punctuation +from typing import List, Optional + +from .models import CakeModel +from .utils import split_text, get_base_unit, is_number, value_to_mm + + +class Transformer: + def __init__(self, raw_data: List[dict]): + """ + This class transforms extracted data according to the desired model + + Args: + raw_data: extracted data + """ + self.raw_data = raw_data + + def transform_data(self) -> List[CakeModel]: + """ + Transforms data + + Returns: + transformed data as a list of models + """ + transformed_cakes = list() + for in_cake in self.raw_data: + out_cake = self.transform_single_item(in_cake) + if out_cake: + transformed_cakes.append(out_cake) + return transformed_cakes + + def transform_single_item(self, input_item: dict) -> Optional[CakeModel]: + """ + Transforms single item of extracted data + + Args: + input_item: part of extracted data + + Returns: + model if transformation was successful + """ + + original_unit, diameter = self.process_diameter( + unit=input_item.get('diam_unit'), + diameter=input_item.get('cake_diameter') + ) + + new = { + 'original_unit': original_unit, + 'diameter_in_mm': diameter, + 'entry_id': 
input_item.get('entry'), + 'name': self.process_name(input_item.get('flavor')), + 'vegan': self.process_vegan(input_item.get('is_cake_vegan')), + } + + return CakeModel(**new) if diameter and original_unit else None + + + def process_diameter(self, unit, diameter): + ''' + Process the unit and diameter + + Args: + unit: the diameter unit + diameter: the diameter + + Returns: + original unit and processed diameter + ''' + + NON_MM_UNITS = ['in', 'm'] + diameter = diameter.strip().lower() + unit = unit.strip().lower() + + # when no units are mentioned, set to milimeters + if unit in ['', '"']: + unit = 'mm' + + # get diamter value + # if diameter value is irrecoverable (a complete string), discard + if diameter[0].isalpha() and diameter[-1].isalpha(): + return None, None + + # if diameter has units, split into diameter and units + elif diameter[-1].isalpha(): + diameter_detials = list(split_text(diameter)) + + # if units doesn't match, discard record + if get_base_unit(unit) != get_base_unit(diameter_detials[1]): + return None, None + + # if they match, continue + else: + # if they're not millimeters, convert + if get_base_unit(unit) in NON_MM_UNITS: + diameter = value_to_mm(value=float(diameter_detials[0]), unit=get_base_unit(unit)) + + # if they're in millimeters, return diameter + else: + diameter = diameter_detials[0] + + # check case diameter is in the form '2.43"' convert to ['2.43', ''] + elif diameter[-1] in punctuation: + diameter_detials = diameter.split(diameter[-1]) + diameter = diameter_detials[0] + + # when diameter has no units + else: + # check if unit is in millimeters, else convert + if get_base_unit(unit) in NON_MM_UNITS: + diameter = value_to_mm(value=float(diameter), unit=get_base_unit(unit)) + + + return get_base_unit(unit), diameter + + + def process_name(self, value): + ''' + Process and return desired cake flavor + + Args: + value: the flavour of cake + + Returns: + the accepted flavour or name if it exists + ''' + from etl import models + + 
value = value.strip().lower() + return value if value in models.VALID_CAKE_FLAVORS else None + + + def process_vegan(self, value): + ''' + Process and return desired vegan value + + Args: + value: the vegan value + + Returns: + True or False if vegan value exists + ''' + + value = value.strip().lower() + if value in ['t','true', 'y', 'yes']: + return True + elif value in ['f', 'false', 'n', 'no']: + return False + elif is_number(value): + return bool(float(value)) + return None \ No newline at end of file diff --git a/etl/utils.py b/etl/utils.py new file mode 100644 index 0000000..35617f3 --- /dev/null +++ b/etl/utils.py @@ -0,0 +1,34 @@ +from itertools import groupby + + +def split_text(s): + '''split str with number and yield result''' + + for k, g in groupby(s, str.isalpha): + yield ''.join(g) + + +def get_base_unit(unit): + '''Resolves the unit to one''' + + return { + 'm':'m', 'metres': 'm', + 'mm': 'mm', 'millimeters': 'm', + 'in': 'in', 'inches': 'in' + }.get(unit) + + +def is_number(n): + '''Validates if a string is a number''' + + try: + float(n) + return True + except ValueError: + return False + + +def value_to_mm(value, unit): + '''Convert values to millimeters''' + + return {'in': 25.4, 'm': 1000}[unit] * value \ No newline at end of file diff --git a/main.py b/main.py new file mode 100644 index 0000000..6f561d6 --- /dev/null +++ b/main.py @@ -0,0 +1,6 @@ +"""Run this script to launch the pipeline""" + +from etl.master import run_etl + +if __name__ == "__main__": + run_etl("cake_data.csv") \ No newline at end of file diff --git a/reports/reports.html b/reports/reports.html new file mode 100644 index 0000000..db8fdb9 --- /dev/null +++ b/reports/reports.html @@ -0,0 +1,882 @@ + + + + Cake Reports with Invalid Name or Vegan + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Cake Reports with Invalid Name or Vegan
Entry_idNameDiameter_in_mmVeganOriginal_unit
2strawberry349.25Nonein
3None480.0Falsemm
5None456.692Falsein
6butter251.95554Nonemm
7babka186.9Nonemm
8None283.5Falsemm
9chocolate293.04925Nonemm
10None451.4Nonemm
12None405.384Nonein
14None629.3567296Falsein
16None386.334Truein
17avocado492.28463Nonemm
19None2.43Nonemm
21biscuit514.0Nonemm
22None321.056Falsein
23None360000.0Nonem
25None160.22910804Truein
26None435.99012Nonemm
27None175.0Nonemm
30chiffon23.300681Nonemm
31None588.7Nonemm
32caramel518.0Nonemm
33None121.91999999999999Nonein
34None301.0Nonemm
35None12.58Falsemm
38vanilla342.8Nonemm
40butter363.21999999999997Nonein
41None382.0Falsemm
43None534.4Truemm
44None432.3Nonemm
45sponge537.21803Nonemm
46None425.45Falsein
47None554.54711036Nonein
49None246.888Nonein
51None223.0Falsemm
52None399.542Nonein
54cream558.53092Nonemm
57biscuit175.00599999999997Nonein
58None534900.0Falsem
60None556.2Truemm
61None0.65Nonemm
62None376.68199999999996Nonein
65None551.0Truemm
66None562.22131Nonemm
69None456.0Nonemm
72chiffon280.0Nonemm
73None437.0071Falsemm
75None32.004Falsein
76None476.5592678599999Truein
77caramel23.25Nonein
80rainbow12.81Nonemm
85None11.74Nonemm
86None32.22252128Nonein
88None471.678Falsein
90None369.5Nonemm
91None6.5824703Nonein
92None9.31Nonemm
94None429.11287Falsemm
95chocolate553.0Nonemm
97biscuit202.438Nonein
99apple575.0Nonemm
101carrot187.49519Nonemm
102None364.54837Nonemm
103caramel202.1Nonemm
105cream135.38526136000002Nonein
109rainbow329.0Nonemm
110chiffon563.626Nonein
111rainbow131.07657808Nonein
113None167.33247Nonemm
114None5.8846421Nonein
115None432.0Nonemm
117None447.80199999999996Nonein
121butter328.03988Nonemm
123None12.78Falsein
128black forest341.12199999999996Nonein
130None175.89768224Nonein
132None313.57076478Falsein
134None23.28Nonein
135None456.1Truemm
136apple573.7913593999999Nonein
137None251.0Truemm
138None119.63399999999999Falsein
139None24.09Falsein
144None18.7984662Nonein
146biscuit599.78417Nonemm
148None166.941Falsemm
149None502.66599999999994Nonein
151None273.1Falsemm
152None21.9Truein
153None421.1Nonemm
154butter88.646Nonein
156None227000.0Falsem
158None24.3589237Falsein
159None72.39Truein
161None35.328749Nonemm
162sponge496531.85000000003Nonem
163None528.96386Falsemm
164None12.8Truemm
166avocado81.23653297999999Nonein
167biscuit189600.0Nonem
168None266.0Falsemm
170None603.25Falsein
174None621.0364541399999Nonein
175carrot351000.0Nonem
176apple363.474Nonein
178None214.61343Truemm
179None265.0Nonemm
182None214000.0Nonem
183None21.33Nonemm
185cream316.992Nonein
186None230330.24Nonem
187None196.3Falsemm
188None354.584Falsein
190None381.17572736Truein
191None275.54087Truemm
193None543.77495258Nonein
194None488.9Falsemm
195None425.8Falsemm
198None56.640123Nonemm
199None335.68962Falsemm
201biscuit402.0Nonemm
203vanilla4.7152478Nonemm
204biscuit504.0Nonemm
205None54.944515460000005Nonein
206biscuit353.06Nonein
207None292.0Falsemm
208None233.79166602Nonein
214None301.7Truemm
215None68.05299826Nonein
216None301.94272352Falsein
217None71.88199999999999Falsein
218black forest267.71599999999995Nonein
219None505.0Truemm
224None374.142Nonein
225None86.868Falsein
229None545.592Falsein
232babka554.2Nonem
234None440.2Nonemm
236avocado555184.74Nonem
237None308.1Falsemm
238None249.6Nonemm
239None162.3Falsemm
241None596.9Nonein
+
+ + +
SummaryTimestampStatus
Cake reports07-03-2022, 12:20Success
+ +
+ + + \ No newline at end of file diff --git a/reports/test_reports.html b/reports/test_reports.html new file mode 100644 index 0000000..5985f23 --- /dev/null +++ b/reports/test_reports.html @@ -0,0 +1,36 @@ + + + + Test Reports (created from unit test) + + +
+ + + + + + + + + + + + + + + + + + + +
Test Reports (created from unit test)
Entry_idNameDiameter_in_mmVeganOriginal_unit
58None534900.0Falsem
60None556.2Truemm
+
+ + +
SummaryTimestampStatus
Cake reports07-03-2022, 12:19Success
+ +
+ + + \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..23a012f --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +mongoengine==0.24.0 +mongomock==4.0.0 +pydantic==1.9.0 +pymongo==4.0 +typing_extensions==4.1.1 \ No newline at end of file diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ + diff --git a/tests/__pycache__/__init__.cpython-38.pyc b/tests/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f9c5e2cbd4dbcd565acdd17856f8569db8cc34b4 GIT binary patch literal 137 zcmWIL<>g`kg7ZP*NsK`HF^Gcv6DM`rf0%qw@FhgPUId>gyB3;f!opY`pBY#u$j(X%Uw-FsY+8F^b)*#5A4yILMT-rFtrB!Y((}6T) eyzQnLLUVtI)%}ce)HZn2R`u4+ui+29kp2V8^WumA literal 0 HcmV?d00001 diff --git a/tests/__pycache__/test_loader.cpython-38.pyc b/tests/__pycache__/test_loader.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b7a951a79b67e7dacfe1d0b31d31ee25634cef4b GIT binary patch literal 1548 zcmZWpJ98sN5T2QxeQ34P-N6~SDyULIu!(gLQWSNR4@f`+oRdkm);nW+<$ai$F~%x6 zvQt;_6Xfu6{s9pw;iN?V2Najky{q%dVpQAP{mpF8*DvjlMlpePIR5nD51){~(b!xZ zj2G~jmmnmOv?4VXl%mcmCOmk16|a5acNnjNIuv0YiHMT#i1cM}O=O_xl_v(0{Y296 z4m^>Ry6BZMb}u(-21f9Gn@gor=90bqME%ebNx)V&&bNIaVtd7H!k0 z!oH@?D=PDv=mcE<``L8gC~c;{PG7d?O$E6z)8aUn&N)Te6Oc&Qhi$Kt(J|v!<)DYdWgE7=h_V zR^=M`b|l^h=q-tlHb^`;^neHSAHj<{9)E#iq2DmSSV9@x_+UfvffgJX(DGwSm&wNng3id8<#%>+mzeCt% z4Al^Rp;cZBR@dLAzKc|2^+i@nHvf9YnF~R7D3}^i2*XqWNvefeJ{fH zU=E#XoL{tOzy`l?{$Z(&bwlVn2rk~o>N6_b1PzS!o>%i9y z?{fB-bllgVjdZAXH-swJBPDtrrd#XLzWBMwcZy$RrW+Nk*IgGKu{RpZLYLANety?)R18{&pdK z9+i>*G)fa#Bt>psjVYXWep#@n!MzT*8UW!Gk4Nu$?N8^JmSmd@=_`87hUA0}DX-6y zA=_i8j1rWT7RcOr@|=RyzK}W~ZH%O~k+cpn&%Eu9Bn^1SJHza=tKs-7(Wu_$3?FCD2S9Vo*js|6kM|`d*iIEEOwW*wfxmeZ{rMtR>)*0$)hyv z=SivCJc%+<3K=GOm}S}(c`1*=gljv82ikcmcA{Kj6-g){B3npifOMJ?t4B8@W7>C5 
z`cZ23X=ghvqS9>FRs#C)9o?$-{iJ$DE{q24RiX{q7qX2iEp022ZU?iajB-`E7@~`T zhv^GVI?sr_0h`|}Eo?$e)WWlcN5x?dM{>0g??!1V@*S~ILLDeWtG875;7DIbsZkh? zbC1rLQX@8_qu;-^y2s||P58~x!JSKm*1bwe59-kkLyTD%YA?h?`zfwlVThp_J*kHw zFXAwi@58Caj|p-UWeyM*O~UB9)V7V}ttP?xT*GbO0GVtOVOMSPoCXf8O}E0jR`qF6 z=PqwR6S>^O%F?rA->S|WERFlfbSs22$;G&puyK8G0?I8f-)&qb9 zoV2pnsa3o0gH+d&90xpvyXeEj#46VoOfm>LR&>xX#+y<2o>@-Swu7~;bTuX`9aGVo zFLCx=6@!Op)@MM7Gai7pJF))9o^8`MZP6AR-2eZ1C& literal 0 HcmV?d00001 diff --git a/tests/__pycache__/test_reports.cpython-38.pyc b/tests/__pycache__/test_reports.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..813c5b7ae560d94a04101a2782316e33723eeb39 GIT binary patch literal 1161 zcmZ`&OK%e~5VqI*B2A!Bka$U~go@ZpB5^{fLaoZgH(9igV7o1i(gSL* z`~VP#9+$K=sH6d&ZeKx+vH(C)8ZbZ#a zz3x-+^oLLnfH#5l0}uiNplfvd8Jxl@5TUq)tAR5i9BPJIT3hE&;PwEReA|vTURYYh zo(*Mb)G^P5b<4Dn_Oi&}XQbj;$&12zhjNgXwo|EWkdM>pRvd z3Is`{CdI#&`TiH8uEM1sk9xaWD&2e6dsZEld6f#?>krdBm*qhA3{hh&cB~-xS)R@FaN`?TN_<~3WvZ0uA#z|1wEmLOvoE5oCy`|13Pgh z&eWL^N(`IOQ#y5LkxQl&BppN;v+DNY m`6>qYlO<{L?z~?Oxae;oMjf&SBwT9}eTvVoj@E{f+w3=$VkZRv literal 0 HcmV?d00001 diff --git a/tests/__pycache__/test_transformer.cpython-38.pyc b/tests/__pycache__/test_transformer.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c5a272a77b2b731840a54c2d5ea0a702f0e46621 GIT binary patch literal 3532 zcmd5<&5zqe6!+NqaK1KeDW%ItI|YOqX-O97ejv4Fp%f^pA|$pPd~w~(WY@*9mlaA47li_K6{w(umP9+1_U=8!VJe`#IcyktVage1m|ai4J5I9=}`^IB)-@I?J$`g|q=@Yn~ zc`CAM`c#6M$Lh3U@&v>5hnYxClmyd^9eu zthU$Il{pxwa?caqfCtL#Mh8VzoZjdVPUl}S>N?T7hFvzf8_;|tD;_4H6r0}`JoqR?+z?gkUZ!+wdp!sC`)j8V2HQC zfv2mTI}j(?dC>VX+6#loW3toR_JV+iTf7qk&QeP^W1zh=QWsE_ed+FcfzR}+;9CuB z%L^b3%d||Ap8pn}na@hKy178`tdntjZhy63Z1#V(#Xd~8*aQ{ohct?}xuCc)F*(u0n|CiScW3o%cDb$@taRvpb3;`L( zP}O9+Y>FWl=kV;U&ZeMy-wfw+t_y0^_kioBypB0nCX z+7~z#i0|DCxFVlCEo;UDET$g{OjN4aM;>V~6uB}!+xLS2(g%JV_9Gg2Kzvn*G1Q1v zw`FMxYj9LE0phkVW6j-M3@&O6j<}*(@1~Bffm!)3h&j^O83O$S1Uh4z<4!Id$J>4m 
zl~z)MV3)Hc%69=kjiMiG>_UKnQ>=XS?#k+5aF>dU5F~LKMF|D|7=in2 zo=Zi1w*aCmnmEj(#-fI0;6LDOn&B~<4f%KQ$w|1;$^|3O;_)!thqdO(aiSVofI3*r z)WOoULY6^eXEc=~6>=#BAqs;L)ybRcQZ|HYfaQjIf&(h?P=uT*vg(+t%R1WQ=B@C^ z{M^VRE{9h^bX(OKQD#|8)f39xlHDlevz#Vghi_hz%uZ{>l7f?z-#&cwLZs?Ry}6W& zRMyh*2)%+X=30u8?qYO-M?z?KAsqo%{1gcY;gmv>LczhHDzfI7YwI(Lj2jc=e!wPW zo-D|s!>)!bX-|$VJmSjBK{}NINxQ!PGK|umKzEm$vz9IC2@;I7p;r!3P1mD3G@6ZR zqsa$5P--UiP`3b-@xC_Zzl|KelgE6*lpF6?Yfu`Tvt_HcM!2*faAKW%T}r>yDp1*t}EU_pFcp+MzM@y z1;wW*K0`6So>g4}-gKssI20 literal 0 HcmV?d00001 diff --git a/tests/test_extractor.py b/tests/test_extractor.py new file mode 100644 index 0000000..5cc65b2 --- /dev/null +++ b/tests/test_extractor.py @@ -0,0 +1,21 @@ +from unittest.case import TestCase + +from etl.extractor import Extractor + + +class TestExtractor(TestCase): + """ + Test Extractor + """ + + def test_extractor(self): + '''Assert data is extracted properly''' + + file_path = './cake_data.csv' + extractor = Extractor(in_file_path=file_path) + data = extractor.extract_data() + + self.assertIsInstance(data, list) + self.assertIsInstance(data[0], dict) + self.assertEqual(len(data[0]), 5) + self.assertIsNotNone(data[0].get('entry')) diff --git a/tests/test_loader.py b/tests/test_loader.py new file mode 100644 index 0000000..a49bd75 --- /dev/null +++ b/tests/test_loader.py @@ -0,0 +1,55 @@ +from unittest import TestCase + +import mongoengine as me + +from etl.loader import Loader +from etl.models import CakeMongoOrm, CakeModel + + + +class TestLoader(TestCase): + """ + Test Loader + """ + + @classmethod + def setUpClass(cls): + me.connect('caketest', host='mongomock://localhost') + + @classmethod + def tearDownClass(cls): + me.disconnect() + + def test_load_data(self): + '''Assert loader works properly''' + + cake_data = [ + CakeModel( + entry_id=180, + diameter_in_mm=522, + name='cream', + original_unit='mm', + vegan=False + ), + CakeModel( + entry_id=201, + diameter_in_mm=400, + name='strawberry', + original_unit='mm', + vegan=True + ) + ] + 
+ loader = Loader(cake_data, test_mode=True) + loader.load_data() + cake_count = CakeMongoOrm.objects().count() + + cake = CakeMongoOrm.objects(entry_id=cake_data[0].entry_id).first() + + self.assertEqual(len(cake_data), cake_count) + + self.assertEqual(cake_data[0].original_unit, cake.original_unit) + self.assertEqual(cake_data[0].diameter_in_mm, cake.diameter_in_mm) + self.assertEqual(cake_data[0].entry_id, cake.entry_id) + self.assertEqual(cake_data[0].name, cake.name) + self.assertEqual(cake_data[0].vegan, cake.vegan) \ No newline at end of file diff --git a/tests/test_models.py b/tests/test_models.py new file mode 100644 index 0000000..eaa8d4b --- /dev/null +++ b/tests/test_models.py @@ -0,0 +1,89 @@ +from unittest import TestCase + +import mongoengine as me + +from etl.models import CakeMongoOrm, CakeModel + + +class TestCakeModel(TestCase): + """ + Test Pydantic model of a cake used for data validation + """ + + def test_data_validation(self): + '''Assert that well formed data is consumed as expected''' + + transformed_data = { + 'original_unit': 'mm', + 'diameter_in_mm': '440.2', + 'entry_id': '234', + 'name': None, + 'vegan': True + } + cake_model = CakeModel(**transformed_data) + + self.assertEqual(transformed_data['original_unit'], cake_model.original_unit) + self.assertEqual(float(transformed_data['diameter_in_mm']), cake_model.diameter_in_mm) + self.assertEqual(int(transformed_data['entry_id']), cake_model.entry_id) + self.assertIsNone(transformed_data['name'], cake_model.name) + self.assertTrue(cake_model.vegan) + + +class TestCakeMongoOrm(TestCase): + """ + Test Mongoengine model of Cake document + """ + + @classmethod + def setUpClass(cls): + me.connect('caketest', host='mongomock://localhost') + + @classmethod + def tearDownClass(cls): + me.disconnect() + + def test_object_creation(self): + '''Assert data is created properly''' + + data = { + 'original_unit': 'mm', + 'diameter_in_mm': '440.2', + 'entry_id': '234', + 'name': 'strawberry', +
'vegan': True + } + validated_data = CakeModel(**data) + CakeMongoOrm(**validated_data.dict()).save() + cake = CakeMongoOrm.objects(entry_id=234).first() + + self.assertEqual(validated_data.original_unit, cake.original_unit) + self.assertEqual(validated_data.diameter_in_mm, cake.diameter_in_mm) + self.assertEqual(validated_data.entry_id, cake.entry_id) + self.assertEqual(validated_data.name, cake.name) + self.assertEqual(validated_data.vegan, cake.vegan) + + def test_bulk_object_creation(self): + '''Assert bulk data creation works properly''' + + bulk_data = [ + { + 'diameter_in_mm': '514.2', + 'entry_id': '200', + 'name': 'cream', + 'original_unit': 'mm', + 'vegan': False + }, + { + 'diameter_in_mm': '402', + 'entry_id': '201', + 'name': 'strawberry', + 'original_unit': 'mm', + 'vegan': True + }, + ] + bulk_validated_data = [CakeModel(**data).dict() for data in bulk_data] + cakes = [CakeMongoOrm(**data) for data in bulk_validated_data] + CakeMongoOrm.objects.insert(cakes) + cake_count = CakeMongoOrm.objects().count() + + self.assertEqual(len(bulk_validated_data), cake_count) \ No newline at end of file diff --git a/tests/test_reports.py b/tests/test_reports.py new file mode 100644 index 0000000..9d3ca89 --- /dev/null +++ b/tests/test_reports.py @@ -0,0 +1,43 @@ +from pathlib import Path +from unittest.case import TestCase + +from etl.reports import Report + + +class TestReport(TestCase): + """ + Test Report + """ + + def setUp(self): + + self.data = [ + { + 'entry_id': 58, + 'name': None, + 'diameter_in_mm': 534900.0, + 'vegan': False, + 'original_unit': 'm' + }, + { + 'entry_id': 60, + 'name': None, + 'diameter_in_mm': 556.2, + 'vegan': True, + 'original_unit': 'mm' + } + ] + + def test_report_is_generated(self): + '''Assert that a report is generated''' + + path = './reports/test_reports.html' + report = Report( + data=self.data, + caption='Test Reports (created from unit test)', + path=path + ) + report.create_report() + new_file = Path(path).resolve() + +
self.assertEqual(new_file.is_file(), True) \ No newline at end of file diff --git a/tests/test_transformer.py b/tests/test_transformer.py new file mode 100644 index 0000000..92a40ea --- /dev/null +++ b/tests/test_transformer.py @@ -0,0 +1,166 @@ +from unittest.case import TestCase + +from etl.transformer import Transformer + + +class TestTransformer(TestCase): + """ + Test Transformer + """ + + def test_transformer_valid_unit_mm(self): + '''Assert that transformer converts properly''' + + transformer = Transformer( + raw_data=[ + { + "entry": "1", + "cake_diameter": "56.78", + "diam_unit": "mm", + "flavor": "caramel", + "is_cake_vegan": "No", + } + ] + ) + res = transformer.transform_data()[0] + + self.assertEqual(res.entry_id, 1) + self.assertEqual(res.name, "caramel") + self.assertEqual(res.diameter_in_mm, 56.78) + self.assertFalse(res.vegan) + self.assertEqual(res.original_unit, "mm") + + def test_diameter_conversion(self): + '''Assert diameter in other units converts to mm''' + + transformer = Transformer( + raw_data=[ + { + "entry": "2", + "cake_diameter": "5", + "diam_unit": "m", + "flavor": "strawberry", + "is_cake_vegan": "yes", + } + ] + ) + res = transformer.transform_data()[0] + + self.assertEqual(res.entry_id, 2) + self.assertEqual(res.name, "strawberry") + self.assertEqual(res.diameter_in_mm, 5000) + self.assertTrue(res.vegan) + self.assertEqual(res.original_unit, "m") + + def test_irrecoverable_data_quality(self): + '''Assert that record is discarded when data quality is irrecoverable''' + + transformer = Transformer( + raw_data=[ + { + "entry": "3", + "cake_diameter": "56.78mm", + "diam_unit": "in", + "flavor": "caramel", + "is_cake_vegan": "false", + }, + { + "entry": "4", + "cake_diameter": "fill this info later", + "diam_unit": "in", + "flavor": "caramel", + "is_cake_vegan": "true", + } + ] + ) + res = transformer.transform_data() + + self.assertListEqual(res, []) + + def test_mixed_diameter_value(self): + '''Assert diameter is resolved, even when 
it is in the form `56mm` ''' + + transformer = Transformer( + raw_data=[ + { + "entry": "5", + "cake_diameter": "56.78mm", + "diam_unit": "mm", + "flavor": "caramel", + "is_cake_vegan": "No", + } + ] + ) + res = transformer.transform_data()[0] + + self.assertEqual(res.entry_id, 5) + self.assertEqual(res.name, "caramel") + self.assertEqual(res.diameter_in_mm, 56.78) + self.assertFalse(res.vegan) + self.assertEqual(res.original_unit, "mm") + + def test_valid_flavor(self): + '''Assert only valid flavours/name are returned''' + + transformer = Transformer( + raw_data=[ + { + "entry": "6", + "cake_diameter": "60", + "diam_unit": "mm", + "flavor": "Invalid flavour", + "is_cake_vegan": "No", + } + ] + ) + res = transformer.transform_data()[0] + + self.assertEqual(res.entry_id, 6) + self.assertIsNone(res.name) + self.assertEqual(res.diameter_in_mm, 60) + self.assertFalse(res.vegan) + self.assertEqual(res.original_unit, "mm") + + def test_valid_vegan(self): + '''Assert that vegan is validated, invalid ones resolve to None''' + + transformer = Transformer( + raw_data=[ + { + "entry": "7", + "cake_diameter": "78", + "diam_unit": "mm", + "flavor": "caramel", + "is_cake_vegan": "Invalid Vegan", + } + ] + ) + res = transformer.transform_data()[0] + + self.assertEqual(res.entry_id, 7) + self.assertEqual(res.name, 'caramel') + self.assertEqual(res.diameter_in_mm, 78) + self.assertIsNone(res.vegan) + self.assertEqual(res.original_unit, "mm") + + def test_valid_diameter_unit(self): + '''Assert empty diameter unit defaults to mm''' + + transformer = Transformer( + raw_data=[ + { + "entry": "8", + "cake_diameter": "80", + "diam_unit": "", + "flavor": "caramel", + "is_cake_vegan": "y", + } + ] + ) + res = transformer.transform_data()[0] + + self.assertEqual(res.entry_id, 8) + self.assertEqual(res.name, 'caramel') + self.assertEqual(res.diameter_in_mm, 80) + self.assertTrue(res.vegan) + self.assertEqual(res.original_unit, "mm") \ No newline at end of file