diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..74fc874 Binary files /dev/null and b/.DS_Store differ diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..6bd93e6 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "python.pythonPath": "env/bin/python" +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..ea6f3a2 --- /dev/null +++ b/README.md @@ -0,0 +1,21 @@ +# Overview + +- Extracts, Transforms and Loads data into Mongo DB +- Mongo express (viewer app) available (Docker required) +- Additional field, `original_report` +- Generated Data Reports +- Test cases + +# Running + +1. To spin up MongoDB and Mongo Express (for viewing the database), run +`docker-compose up` N/B: Must have docker installed + +2. Goto `http://127.0.0.1:8081` to see the visual database + +3. To run the etl app, run `python3 main.py` + + +# Testing + To run tests, use: ` python3 -m unittest discover` + diff --git a/cake_data.csv b/cake_data.csv new file mode 100644 index 0000000..61a6446 --- /dev/null +++ b/cake_data.csv @@ -0,0 +1,242 @@ +entry|cake_diameter|diam_unit|flavor|is_cake_vegan +1|21.8830537inches||RED|FALSE +2|13.75| in|Strawberry|Strawberry +3|480|||FALSE +4|9.8690561|inches|BUTTER|NO +5|17.98|in |Avocado Cake|0 +6|251.95554|mm|butter| +7|186.9 mm|mm|Babka| +8|283.5 mm||Chiffon Cake|FALSE +9|293.04925 mm|MM|chocolate|N/A +10|451.4 MM|mm|chokolade|N/A +11|418.4|mm|avocado|yes +12|15.96|inches|Chiffon Cake| +13|24.04in|inches|Strawberry|no +14|24.777824|inches|Caramel Cake|FALSE +15|231.3mm||BISCUIT|NO +16|15.21|in|BLACKFOREST|Y +17|492.28463 mm|mm|avocado|N/A +18|7.9989575|inches|VANILLA|FALSE +19|"2.43"""|""""|Chiffon Cake| +20|16.7502938|inches|rainbow|t +21|514|mm|biscuit|N/A +22|12.64|inches||0 +23|360|millimeters|Strawbery| +24|"5.81"""|""""|Chiffon|FALSE +25|6.3082326|in|BLACKFOREST|6 +26|435.99012||APFEL| +27|175 mm|mm|N/A| +28|398.01697 
millimeters|mm|butter| +29|211.37809|mm|Avocado|Y +30|23.300681||Chiffon|blue +31|588.7|mm|Strawbery| +32|518mm|mm|CARAMEL| +33|4.8|inches|chokolade| +34|301||baunilha|N/A +35|12.58||N/A|NO +36|13.77 in|inches|vanilla|NO +37|171.53843mm|mm|butter|FALSE +38|342.8|mm|VANILLA| +39|513.8 millimeters|mm|N/A|no +40|14.3|in|BUTTER| +41|382.0|mm|vanilla cake|no +42|196 mm|inches|apple pie| +43|534.4mm||APFEL|YES +44|432.3|mm|Avocado Cake| +45|537.21803|mm|sponge| +46|16.75|inches|butter cake|NO +47|21.8325634|inches|apple pie|partially +48|196.82931 millimeters|mm||FALSE +49|9.72|inches|chokolade| +50|10.66 inches||BUTTER|TRUE +51|223||baunilha|FALSE +52|15.73|inches|| +53|6.21|inches|apple|FALSE +54|558.53092 mm|mm|CREAM| +55|473.5mm|inches|vanilla| +56|163mm|millimeters|Strawberry| +57|6.89|inches|biscuit|N/A +58|534.9|millimeters|RED|0 +59|581|mm|Avocado|FALSE +60|556.2mm||Chiffon Cake|1 +61|0.65||vanilla cake| +62|14.83|inches|Caramel Cake| +63|314.04175|mm|Caramel|0 +64|394 MM|inches|sponge|6 +65|551||choc|YES +66|562.22131 mm||apple pie|N/A +67|248|mm|Caramel|f +68|9.9470194 inches|inches|cream|0 +69|456|mm|b. forest| +70|8.68881|inches|biscuit |FALSE +71|392|mm|CREAM|yes +72|280||Chiffon| +73|437.0071 mm||RED|N +74|230|mm|carrot|yes +75|1.26in|inches||FALSE +76|18.7621759|inches|BLACKFOREST|YES +77|"23.25"""|in|caramel| +78|16.59|inches|vanilla|-1 +79|10.31 inches|mm|butter cake|YES +80|12.81||rainbow| +81|12.8||sponge|no +82|466.9|mm|biscuit |no +83|"19.8874942"""||APPLE|NO +84|13.33in||carrot| +85|11.74||vanilla cake|N/A +86|1.2686032|inches|Strawbery| +87|11.75 inch||strawbery|TRUE +88|18.57|inches||N +89|7.45 inch||b. forest|6 +90|369.5mm|||N/A +91|"6.5824703"""|in|Caramel Cake| +92|9.31||b. forest|N/A +93|test|test|Strawbery|Not sure +94|429.11287||apple pie|no +95|553|mm|chocolate|N/A +96|298.01697 millimeters|mm|butter|not in the slightest!! 
+97|7.97inches|inches|biscuit| +98|268||CREAM|no +99|575|MM|APPLE| +100|6.19inch|inches||6 +101|187.49519||carrot| +102|364.54837|mm|Caramel Cake| +103|202.1mm||caramel| +104|166.2| |BISCUIT|NO +105|5.3301284|inches|CREAM| +106|16.36|inches|chocolate|FALSE +107|211.9 millimeters|mm|APPLE| +108|10.2inch|inches|apple| +109|329|mm|rainbow|N/A +110|22.19|inches |Chiffon| +111|5.1604952|inches|rainbow| +112|18.92762in|18.92762in|18.92762in|18.92762in +113|167.33247|mm|14| +114|"5.8846421 """|in|strawbery| +115|432||SWBERRY| +116|432||apple|0 +117|17.63|in|| +118|3.8251793|inches|caramel|f +119|24.6356199|inches|Avocado|Y +120|356.20528||biscuit|NO +121|328.03988mm||butter|N/A +122|17.64 inch|inches|chocolate| +123|"12.78 """|inches||FALSE +124|22.21 inch|inches|CREAM|NO +125|295.56098 millimeters||BLACKFOREST|6 +126|10.8 inches|mm||no +127|429 mm|mm|CREAM|0 +128|13.43|in|black forest| +129|fill this info later|inches||FALSE +130|6.9251056|inches|strawbery| +131|266.2|mm|black forest|0.1 +132|12.3453057|inches|SWBERRY|NO +133|16.34 in|||NO +134|"23.28"""|inches|| +135|456.1|mm|BUTTER_CAKE|yes +136|22.590211|inches|APPLE| +137|251|mm|Avocado Cake|Y +138|4.71|inches|Caramel Cake|FALSE +139|"24.09"""|inches||f +140|218.24368|mm|vanilla|N +141|0.2m|m|carrot|yes +142|22.93m|inches|avocado| +143|19.19|inches|black forest|no +144|"18.7984662"""|in|SWBERRY| +145|266|millimeters|Caramel|FALSE +146|599.78417|MM|BISCUIT| +147|42.11287|cake|apple pie|no +148|166.941|mm|apple pie|FALSE +149|19.79|inches|RED| +150|22.69|in|BISCUIT|1 +151|273.1|mm|BUTTER_CAKE|f +152|"21.9 """|inches|Avocado Cake|6 +153|421.1|mm|Strawbery| +154|3.49|inches|butter| +155|very large||butter| +156|227|millimeters|SWBERRY|N +157|534.6mm|mm|biscuit |Y +158|"24.3589237"""|inches|choc|FALSE +159|2.85 in|inches|BLACKFOREST|TRUE +160|5.16 inch|inches||NO +161|35.328749||vanilla cake| +162|496.53185|millimeters|sponge| +163|528.96386 MM||Chiffon Cake|FALSE +164|"12.8"""||Caramel Cake|t +165|174.55053|mm|apple|Y 
+166|3.1982887 inches|inches|avocado| +167|189.6|millimeters|BISCUIT| +168|266|MM|chokolade|no +169|8.2284843 inches|inches|black forest|NO +170|23.75|inches|Chiffon Cake|0 +171|591.00988 millimeters|MM|BLACKFOREST|0 +172|17.5759116inch|inches|vanilla|f +173|428.31116 millimeters|inches|BISCUIT| +174|24.4502541|inches|| +175|351|millimeters|carrot|N/A +176|14.31|inches|APPLE|N/A +177|14.153215|in|butter|t +178|214.61343mm|MM|Avocado Cake|yes +179|265||BUTTER_CAKE|N/A +180|11.23inches|mm |biscuit |0 +181|371|mm|BUTTER|FALSE +182|214|millimeters|Strawbery| +183|21.33||baunilha| +184|260 millimeters|mm|rainbow| +185|12.48|inches|cream|N/A +186|230.33024|millimeters|N/A| +187|196.3||BUTTER_CAKE|0 +188|13.96inches|inches||no +189|2|average human head||no +190|15.0069184|inches |Avocado Cake|6 +191|275.54087||chokolade|14 +192|571|mm|Chiffon|no +193|21.4084627|inches|Chiffon Cake| +194|488.9 MM|mm|BLACKFOREST|NO +195|425.8|mm|apple pie|0 +196|319.5|mm|BUTTER|0 +197|0.1719|meters|RED| +198|56.640123||baunilha| +199|335.68962 MM||SWBERRY|FALSE +200|514.2||chocolate|NO +201|402|mm|biscuit| +202|313|millimeters| biscuit |0 +203|4.7152478||VANILLA | +204|504||biscuit | +205|2.1631699|in|Strawbery| +206|13.9in|inches|biscuit | +207|292|mm|APFEL |FALSE +208|9.2043963|in|strawbery| +209|22.24 in|""""|carrot|0 +210|250.04218||carrot|0 +211|18.0199549|inches|Chiffon|NO +212|16.62|inches|avocado|FALSE +213|14.07|""""|rainbow|Y +214|301.7|mm|SWBERRY|TRUE +215|2.6792519inches|inches|strawbery| +216|11.8875088|inches|b. 
forest|no +217|2.83|inches|chokolade|no +218|10.54|in|black forest| +219|505||choc|6 +220|519385039|millimeters|carrot|NO +221|518.40178|mm|Avocado|f +222|23.8636638inches|inches|apple|FALSE +223|21.9175111|inches|CREAM|TRUE +224|14.73|inches|| +225|3.42|inches|RED|f +226|2.57inches|mm||inches +227|219mm|millimeters|sponge| +228|22.9384568|inches|BUTTER|NO +229|21.48in|inches|Avocado Cake|NO +230|9.05 inch| inches||no +231|198 millimeters|inches|biscuit |1 +232|0.5542|m|Babka| +233|588.6 millimeters|mm|carrot|f +234|440.2|mm|APFEL| +235|13.11|inches |caramel|yes +236|555.18474|millimeters|avocado| +237|308.1 MM|mm|APFEL|NO +238|249.6|mm|Chiffon Cake|N/A +239|162.3 mm|MM|Avocado Cake|NO +240|351.79509|mm|butter|FALSE +241|23.5|inches|BLACKFOREST| diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..96716b1 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,18 @@ +version: "3.8" + +services: + + mongo: + image: mongo + restart: always + ports: + - 27017:27017 + + mongo-express: + image: mongo-express + restart: always + ports: + - 8081:8081 + environment: + ME_CONFIG_MONGODB_URL: mongodb://mongo:27017/ + diff --git a/etl/__init__.py b/etl/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/etl/__pycache__/__init__.cpython-38.pyc b/etl/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000..f7da27c Binary files /dev/null and b/etl/__pycache__/__init__.cpython-38.pyc differ diff --git a/etl/__pycache__/extractor.cpython-38.pyc b/etl/__pycache__/extractor.cpython-38.pyc new file mode 100644 index 0000000..5255c8d Binary files /dev/null and b/etl/__pycache__/extractor.cpython-38.pyc differ diff --git a/etl/__pycache__/loader.cpython-38.pyc b/etl/__pycache__/loader.cpython-38.pyc new file mode 100644 index 0000000..8ee50c1 Binary files /dev/null and b/etl/__pycache__/loader.cpython-38.pyc differ diff --git a/etl/__pycache__/master.cpython-38.pyc b/etl/__pycache__/master.cpython-38.pyc new file mode 100644 
import csv

from typing import List


class Extractor:
    """Reads the pipe-delimited source file into a list of row dictionaries."""

    def __init__(self, in_file_path: str):
        """
        This class extracts data from source file

        Args:
            in_file_path: path to the source file
        """
        self.in_file_path = in_file_path

    def extract_data(self) -> List[dict]:
        """
        Extracts data from CSV file

        The first line of the file is taken as the header; every following
        line becomes one dictionary keyed by the header names.

        Returns:
            data as a list of dictionaries, one per data row (an empty list
            when the file holds only a header or nothing at all)
        """

        ## newline="" hands line-ending handling to the csv module, as its
        ## documentation recommends for files passed to a reader.
        with open(self.in_file_path, "r", newline="") as csvfile:
            ## DictReader consumes the header line by itself to build the
            ## field names, so no row may be skipped here: an extra
            ## next(reader) would drop the first data record and raise
            ## StopIteration on a header-only file.
            reader = csv.DictReader(csvfile, delimiter='|')
            return list(reader)
cake_data: List[CakeModel], test_mode: bool = False): + """ + This class loads transformed data into the database + + Args: + cake_data: transformed data + test_mode: live mode or unit testing mode + """ + + if not test_mode: + connect() + + self.cake_data = cake_data + + def load_data(self): + """ + Inserts data into the database + """ + + print("Preparing data...") + cakes = [CakeMongoOrm(**data.dict()) for data in self.cake_data] + + CakeMongoOrm.objects.delete() + + print("Inserting data into the database... please wait") + CakeMongoOrm.objects.insert(cakes) + + print("Data loaded into the database successfully!") diff --git a/etl/master.py b/etl/master.py new file mode 100644 index 0000000..fe9c269 --- /dev/null +++ b/etl/master.py @@ -0,0 +1,22 @@ +from .extractor import Extractor +from .loader import Loader +from .transformer import Transformer +from .reports import Report + + +def run_etl(input_file: str): + """ + Runs whole ETL pipeline + + Args: + input_file: path to the source file + """ + extractor = Extractor(input_file) + transformer = Transformer(extractor.extract_data()) + loader = Loader(transformer.transform_data()) + + loader.load_data() + + # create reports + report = Report() + report.create_report() diff --git a/etl/models.py b/etl/models.py new file mode 100644 index 0000000..351addb --- /dev/null +++ b/etl/models.py @@ -0,0 +1,48 @@ +from typing import Optional + +import mongoengine as me +from pydantic import BaseModel, Field + +VALID_CAKE_FLAVORS = [ + "butter", + "carrot", + "black forest", + "avocado", + "vanilla", + "caramel", + "rainbow", + "chiffon", + "cream", + "babka", + "sponge", + "apple", + "strawberry", + "biscuit", + "chocolate", +] + +VALID_UNITS = ["mm", "in", "m"] + + +class CakeMongoOrm(me.Document): + """ + Mongoengine model of Cake document + """ + + entry_id = me.IntField(required=True, unique=True) + name = me.StringField(null=True, choices=VALID_CAKE_FLAVORS) + diameter_in_mm = me.FloatField(required=True) + vegan = 
def write_to_file(self, content: str):
    '''
    Writes the html string to the html file at ``self.path``

    Any missing parent folder of ``self.path`` is created first, and an
    existing file at that path is overwritten.

    Args:
        content: the complete html document to save
    '''
    import os

    ## Create the target folder on demand so that writing does not fail
    ## with FileNotFoundError when the folder is absent
    folder = os.path.dirname(self.path)
    if folder:
        os.makedirs(folder, exist_ok=True)

    ## Save the HTML code; the context manager closes the handle even
    ## when the write itself raises
    with open(self.path, 'w') as file_obj:
        file_obj.write(content)
'\n' + table += '\n' + + table += '\t
" + self.caption + "
' + k.capitalize() + '
' + str(row[k]) + '
\n' + return table + + + def create_report(self): + ''' + Creates html data for reports and calls the method that writes to html file + ''' + + # Start the page + content = ''' + + + ''' + self.caption + ''' + + +
+ \n + ''' + + # Add content to the body + content += self.create_html_table() + content += '
' + + content += "\t\n" + content += "\t\t\n" + content += '\t\t\n' + content += '\t
SummaryTimestampStatus
Cake reports' + datetime.now().strftime("%d-%m-%Y, %H:%M") + 'Success
\n' + + # Close the body and end the file + content += ''' +
+ + + ''' + + self.write_to_file(content) + + print(f"Reports created successfully, please open '{self.path}' to view") + + def get_data_from_mongo(self): + ''' + Gets a list of possible cakes filled in error from mongo whose data might not make sense, + precisely, cake data with invalid name or vegan + + Returns: + a list of dictionaries containing cake data + ''' + + connect() + + cake_objects = CakeMongoOrm.objects(Q(name=None) | Q(vegan=None)) + + return [{ + 'entry_id': cake.entry_id, + 'name': cake.name, + 'diameter_in_mm': cake.diameter_in_mm, + 'vegan': cake.vegan, + 'original_unit': cake.original_unit + } for cake in cake_objects ] + \ No newline at end of file diff --git a/etl/transformer.py b/etl/transformer.py new file mode 100644 index 0000000..24ee192 --- /dev/null +++ b/etl/transformer.py @@ -0,0 +1,151 @@ +from string import punctuation +from typing import List, Optional + +from .models import CakeModel +from .utils import split_text, get_base_unit, is_number, value_to_mm + + +class Transformer: + def __init__(self, raw_data: List[dict]): + """ + This class transforms extracted data according to the desired model + + Args: + raw_data: extracted data + """ + self.raw_data = raw_data + + def transform_data(self) -> List[CakeModel]: + """ + Transforms data + + Returns: + transformed data as a list of models + """ + transformed_cakes = list() + for in_cake in self.raw_data: + out_cake = self.transform_single_item(in_cake) + if out_cake: + transformed_cakes.append(out_cake) + return transformed_cakes + + def transform_single_item(self, input_item: dict) -> Optional[CakeModel]: + """ + Transforms single item of extracted data + + Args: + input_item: part of extracted data + + Returns: + model if transformation was successful + """ + + original_unit, diameter = self.process_diameter( + unit=input_item.get('diam_unit'), + diameter=input_item.get('cake_diameter') + ) + + new = { + 'original_unit': original_unit, + 'diameter_in_mm': diameter, + 'entry_id': 
def process_diameter(self, unit, diameter):
    '''
    Process the unit and diameter

    Both raw fields are stripped and lower-cased, the unit is resolved to
    its base form, any unit suffix or trailing mark is removed from the
    diameter, and the numeric value is converted to millimeters.

    Args:
        unit: the diameter unit (may be empty or None)
        diameter: the diameter, optionally carrying its own unit suffix
            (e.g. '56.78mm') or a trailing mark (e.g. '2.43"')

    Returns:
        a tuple of (base unit, diameter in millimeters as a float), or
        (None, None) when the record cannot be recovered
    '''

    NON_MM_UNITS = ['in', 'm']

    ## a missing column may arrive as None instead of a string
    diameter = (diameter or '').strip().lower()
    unit = (unit or '').strip().lower()

    ## nothing to parse; also protects the indexing done below
    if not diameter:
        return None, None

    ## when no units are mentioned, set to millimeters
    ## NOTE(review): a lone '"' is still treated as "no unit" here;
    ## confirm it was not meant as an inch mark
    if unit in ['', '"']:
        unit = 'mm'

    base_unit = get_base_unit(unit)

    ## an unrecognised unit can never yield a usable record
    if base_unit is None:
        return None, None

    ## if diameter value is irrecoverable (a complete string), discard
    if diameter[0].isalpha() and diameter[-1].isalpha():
        return None, None

    if diameter[-1].isalpha():
        ## diameter carries its own unit: '56.78mm' -> ['56.78', 'mm']
        diameter_details = list(split_text(diameter))

        ## if the two units disagree, discard the record
        if base_unit != get_base_unit(diameter_details[1]):
            return None, None

        diameter = diameter_details[0]

    elif diameter[-1] in punctuation:
        ## form '2.43"': keep only what precedes the trailing mark
        diameter = diameter.split(diameter[-1])[0]

    ## whatever is left must be a plain number, otherwise discard
    try:
        value = float(diameter)
    except ValueError:
        return None, None

    ## one conversion step for every branch, so a value such as '23.25"'
    ## given in inches is converted just like a bare '23.25'
    if base_unit in NON_MM_UNITS:
        value = value_to_mm(value=value, unit=base_unit)

    return base_unit, value
import math
from itertools import groupby

## Every accepted spelling of a unit, mapped to its base unit
_UNIT_ALIASES = {
    'm': 'm', 'meter': 'm', 'meters': 'm', 'metre': 'm', 'metres': 'm',
    'mm': 'mm', 'millimeter': 'mm', 'millimeters': 'mm',
    'millimetre': 'mm', 'millimetres': 'mm',
    'in': 'in', 'inch': 'in', 'inches': 'in',
}

## How many millimeters make up one of each base unit
_MM_PER_UNIT = {'mm': 1, 'in': 25.4, 'm': 1000}


def split_text(s):
    '''
    split str with number and yield result

    Consecutive characters are grouped by whether they are alphabetic, so
    '56.78mm' yields '56.78' and then 'mm'.
    '''

    for _, group in groupby(s, str.isalpha):
        yield ''.join(group)


def get_base_unit(unit):
    '''
    Resolves the unit to one

    Args:
        unit: a unit name in any accepted spelling; surrounding whitespace
            and letter case are ignored

    Returns:
        'mm', 'in' or 'm', or None when the unit is not recognised
        (including when it is not a string)
    '''

    if not isinstance(unit, str):
        return None
    ## 'millimeters' must resolve to 'mm'; resolving it to 'm' makes
    ## callers scale millimeter values by 1000
    return _UNIT_ALIASES.get(unit.strip().lower())


def is_number(n):
    '''
    Validates if a string is a number

    Returns:
        True only when the value parses as a finite float; None, free text,
        'nan' and 'inf' all give False
    '''

    try:
        return math.isfinite(float(n))
    except (TypeError, ValueError):
        return False


def value_to_mm(value, unit):
    '''
    Convert values to millimeters

    Args:
        value: the numeric value to convert
        unit: the base unit of the value, one of 'mm', 'in' or 'm'

    Returns:
        the value expressed in millimeters

    Raises:
        KeyError: if the unit is not a supported base unit
    '''

    return _MM_PER_UNIT[unit] * value

Cake Reports with Invalid Name or Vegan
Entry_idNameDiameter_in_mmVeganOriginal_unit
2strawberry349.25Nonein
3None480.0Falsemm
5None456.692Falsein
6butter251.95554Nonemm
7babka186.9Nonemm
8None283.5Falsemm
9chocolate293.04925Nonemm
10None451.4Nonemm
12None405.384Nonein
14None629.3567296Falsein
16None386.334Truein
17avocado492.28463Nonemm
19None2.43Nonemm
21biscuit514.0Nonemm
22None321.056Falsein
23None360000.0Nonem
25None160.22910804Truein
26None435.99012Nonemm
27None175.0Nonemm
30chiffon23.300681Nonemm
31None588.7Nonemm
32caramel518.0Nonemm
33None121.91999999999999Nonein
34None301.0Nonemm
35None12.58Falsemm
38vanilla342.8Nonemm
40butter363.21999999999997Nonein
41None382.0Falsemm
43None534.4Truemm
44None432.3Nonemm
45sponge537.21803Nonemm
46None425.45Falsein
47None554.54711036Nonein
49None246.888Nonein
51None223.0Falsemm
52None399.542Nonein
54cream558.53092Nonemm
57biscuit175.00599999999997Nonein
58None534900.0Falsem
60None556.2Truemm
61None0.65Nonemm
62None376.68199999999996Nonein
65None551.0Truemm
66None562.22131Nonemm
69None456.0Nonemm
72chiffon280.0Nonemm
73None437.0071Falsemm
75None32.004Falsein
76None476.5592678599999Truein
77caramel23.25Nonein
80rainbow12.81Nonemm
85None11.74Nonemm
86None32.22252128Nonein
88None471.678Falsein
90None369.5Nonemm
91None6.5824703Nonein
92None9.31Nonemm
94None429.11287Falsemm
95chocolate553.0Nonemm
97biscuit202.438Nonein
99apple575.0Nonemm
101carrot187.49519Nonemm
102None364.54837Nonemm
103caramel202.1Nonemm
105cream135.38526136000002Nonein
109rainbow329.0Nonemm
110chiffon563.626Nonein
111rainbow131.07657808Nonein
113None167.33247Nonemm
114None5.8846421Nonein
115None432.0Nonemm
117None447.80199999999996Nonein
121butter328.03988Nonemm
123None12.78Falsein
128black forest341.12199999999996Nonein
130None175.89768224Nonein
132None313.57076478Falsein
134None23.28Nonein
135None456.1Truemm
136apple573.7913593999999Nonein
137None251.0Truemm
138None119.63399999999999Falsein
139None24.09Falsein
144None18.7984662Nonein
146biscuit599.78417Nonemm
148None166.941Falsemm
149None502.66599999999994Nonein
151None273.1Falsemm
152None21.9Truein
153None421.1Nonemm
154butter88.646Nonein
156None227000.0Falsem
158None24.3589237Falsein
159None72.39Truein
161None35.328749Nonemm
162sponge496531.85000000003Nonem
163None528.96386Falsemm
164None12.8Truemm
166avocado81.23653297999999Nonein
167biscuit189600.0Nonem
168None266.0Falsemm
170None603.25Falsein
174None621.0364541399999Nonein
175carrot351000.0Nonem
176apple363.474Nonein
178None214.61343Truemm
179None265.0Nonemm
182None214000.0Nonem
183None21.33Nonemm
185cream316.992Nonein
186None230330.24Nonem
187None196.3Falsemm
188None354.584Falsein
190None381.17572736Truein
191None275.54087Truemm
193None543.77495258Nonein
194None488.9Falsemm
195None425.8Falsemm
198None56.640123Nonemm
199None335.68962Falsemm
201biscuit402.0Nonemm
203vanilla4.7152478Nonemm
204biscuit504.0Nonemm
205None54.944515460000005Nonein
206biscuit353.06Nonein
207None292.0Falsemm
208None233.79166602Nonein
214None301.7Truemm
215None68.05299826Nonein
216None301.94272352Falsein
217None71.88199999999999Falsein
218black forest267.71599999999995Nonein
219None505.0Truemm
224None374.142Nonein
225None86.868Falsein
229None545.592Falsein
232babka554.2Nonem
234None440.2Nonemm
236avocado555184.74Nonem
237None308.1Falsemm
238None249.6Nonemm
239None162.3Falsemm
241None596.9Nonein
+
+ + +
SummaryTimestampStatus
Cake reports07-03-2022, 12:20Success
+ +
+ + + \ No newline at end of file diff --git a/reports/test_reports.html b/reports/test_reports.html new file mode 100644 index 0000000..5985f23 --- /dev/null +++ b/reports/test_reports.html @@ -0,0 +1,36 @@ + + + + Test Reports (created from unit test) + + +
+ + + + + + + + + + + + + + + + + + + +
Test Reports (created from unit test)
Entry_idNameDiameter_in_mmVeganOriginal_unit
58None534900.0Falsem
60None556.2Truemm
+
+ + +
SummaryTimestampStatus
Cake reports07-03-2022, 12:19Success
+ +
+ + + \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..23a012f --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +mongoengine==0.24.0 +mongomock==4.0.0 +pydantic==1.9.0 +pymongo==4.0 +typing_extensions==4.1.1 \ No newline at end of file diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ + diff --git a/tests/__pycache__/__init__.cpython-38.pyc b/tests/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000..f9c5e2c Binary files /dev/null and b/tests/__pycache__/__init__.cpython-38.pyc differ diff --git a/tests/__pycache__/test_extractor.cpython-38.pyc b/tests/__pycache__/test_extractor.cpython-38.pyc new file mode 100644 index 0000000..cb6930c Binary files /dev/null and b/tests/__pycache__/test_extractor.cpython-38.pyc differ diff --git a/tests/__pycache__/test_loader.cpython-38.pyc b/tests/__pycache__/test_loader.cpython-38.pyc new file mode 100644 index 0000000..b7a951a Binary files /dev/null and b/tests/__pycache__/test_loader.cpython-38.pyc differ diff --git a/tests/__pycache__/test_models.cpython-38.pyc b/tests/__pycache__/test_models.cpython-38.pyc new file mode 100644 index 0000000..1451f66 Binary files /dev/null and b/tests/__pycache__/test_models.cpython-38.pyc differ diff --git a/tests/__pycache__/test_reports.cpython-38.pyc b/tests/__pycache__/test_reports.cpython-38.pyc new file mode 100644 index 0000000..813c5b7 Binary files /dev/null and b/tests/__pycache__/test_reports.cpython-38.pyc differ diff --git a/tests/__pycache__/test_transformer.cpython-38.pyc b/tests/__pycache__/test_transformer.cpython-38.pyc new file mode 100644 index 0000000..c5a272a Binary files /dev/null and b/tests/__pycache__/test_transformer.cpython-38.pyc differ diff --git a/tests/test_extractor.py b/tests/test_extractor.py new file mode 100644 index 0000000..5cc65b2 --- /dev/null +++ 
b/tests/test_extractor.py @@ -0,0 +1,21 @@ +from unittest.case import TestCase + +from etl.extractor import Extractor + + +class TestExtractor(TestCase): + """ + Test Extractor + """ + + def test_extractor(self): + '''Assert data is extracted properly''' + + file_path = './cake_data.csv' + extractor = Extractor(in_file_path=file_path) + data = extractor.extract_data() + + self.assertIsInstance(data, list) + self.assertIsInstance(data[0], dict) + self.assertEqual(len(data[0]), 5) + self.assertIsNotNone(data[0].get('entry')) diff --git a/tests/test_loader.py b/tests/test_loader.py new file mode 100644 index 0000000..a49bd75 --- /dev/null +++ b/tests/test_loader.py @@ -0,0 +1,55 @@ +from unittest import TestCase + +import mongoengine as me + +from etl.loader import Loader +from etl.models import CakeMongoOrm, CakeModel + + + +class TestLoader(TestCase): + """ + Test Loader + """ + + @classmethod + def setUpClass(cls): + me.connect('caketest', host='mongomock://localhost') + + @classmethod + def tearDownClass(cls): + me.disconnect() + + def test_load_data(self): + '''Assert loader works properly''' + + cake_data = [ + CakeModel( + entry_id=180, + diameter_in_mm=522, + name='cream', + original_unit='mm', + vegan=False + ), + CakeModel( + entry_id=201, + diameter_in_mm=400, + name='strawberry', + original_unit='mm', + vegan=True + ) + ] + + loader = Loader(cake_data, test_mode=True) + loader.load_data() + cake_count = CakeMongoOrm.objects().count() + + cake = CakeMongoOrm.objects(entry_id=cake_data[0].entry_id).first() + + self.assertEqual(len(cake_data), cake_count) + + self.assertEqual(cake_data[0].original_unit, cake.original_unit) + self.assertEqual(cake_data[0].diameter_in_mm, cake.diameter_in_mm) + self.assertEqual(cake_data[0].entry_id, cake.entry_id) + self.assertEqual(cake_data[0].name, cake.name) + self.assertEqual(cake_data[0].vegan, cake.vegan) \ No newline at end of file diff --git a/tests/test_models.py b/tests/test_models.py new file mode 100644 index 
0000000..eaa8d4b
--- /dev/null
+++ b/tests/test_models.py
@@ -0,0 +1,89 @@
+from unittest import TestCase
+
+import mongoengine as me
+
+from etl.models import CakeMongoOrm, CakeModel
+
+
+class TestCakeModel(TestCase):
+    """
+    Test the Pydantic model of a cake used for data validation
+    """
+
+    def test_data_validation(self):
+        '''Assert that well formed data is consumed as expected'''
+
+        transformed_data = {
+            'original_unit': 'mm',
+            'diameter_in_mm': '440.2',  # numeric string; compared as a float below
+            'entry_id': '234',  # numeric string; compared as an int below
+            'name': None,
+            'vegan': True
+        }
+        cake_model = CakeModel(**transformed_data)
+
+        self.assertEqual(transformed_data['original_unit'], cake_model.original_unit)
+        self.assertEqual(float(transformed_data['diameter_in_mm']), cake_model.diameter_in_mm)
+        self.assertEqual(int(transformed_data['entry_id']), cake_model.entry_id)
+        self.assertIsNone(cake_model.name)  # check the model field; a 2nd argument is only the failure message
+        self.assertTrue(cake_model.vegan)
+
+
+class TestCakeMongoOrm(TestCase):
+    """
+    Test the Mongoengine model of the Cake document
+    """
+
+    @classmethod
+    def setUpClass(cls):
+        me.connect('caketest', host='mongomock://localhost')  # one connection shared by every test in this class
+
+    @classmethod
+    def tearDownClass(cls):
+        me.disconnect()
+
+    def test_object_creation(self):
+        '''Assert a validated record is saved and read back unchanged'''
+
+        data = {
+            'original_unit': 'mm',
+            'diameter_in_mm': '440.2',
+            'entry_id': '234',
+            'name': 'strawberry',
+            'vegan': True
+        }
+        validated_data = CakeModel(**data)
+        CakeMongoOrm(**validated_data.dict()).save()
+        cake = CakeMongoOrm.objects(entry_id=234).first()
+
+        self.assertEqual(validated_data.original_unit, cake.original_unit)
+        self.assertEqual(validated_data.diameter_in_mm, cake.diameter_in_mm)
+        self.assertEqual(validated_data.entry_id, cake.entry_id)
+        self.assertEqual(validated_data.name, cake.name)
+        self.assertEqual(validated_data.vegan, cake.vegan)
+
+    def test_bulk_object_creation(self):
+        '''Assert bulk insert stores every validated record'''
+
+        bulk_data = [
+            {
+                'diameter_in_mm': '514.2',
+                'entry_id': '200',
+                'name': 'cream',
+                'original_unit': 'mm',
+                'vegan': False
+            },
+            {
+                'diameter_in_mm': '402',
+                'entry_id': '201',
+                'name': 'strawberry',
+                'original_unit': 'mm',
+                'vegan': True
+            },
+        ]
+        bulk_validated_data = [CakeModel(**data).dict() for data in bulk_data]
+        cakes = [CakeMongoOrm(**data) for data in bulk_validated_data]
+        CakeMongoOrm.objects.insert(cakes)
+        cake_count = CakeMongoOrm.objects(entry_id__in=[200, 201]).count()  # only this test's ids; the db is shared
+
+        self.assertEqual(len(bulk_validated_data), cake_count)
\ No newline at end of file
diff --git a/tests/test_reports.py b/tests/test_reports.py
new file mode 100644
index 0000000..9d3ca89
--- /dev/null
+++ b/tests/test_reports.py
@@ -0,0 +1,43 @@
+from pathlib import Path
+from unittest.case import TestCase
+
+from etl.reports import Report
+
+
+class TestReport(TestCase):
+    """
+    Test Report generation from a list of cake records
+    """
+
+    def setUp(self):
+
+        self.data = [
+            {
+                'entry_id': 58,
+                'name': None,
+                'diameter_in_mm': 534900.0,
+                'vegan': False,
+                'original_unit': 'm'
+            },
+            {
+                'entry_id': 60,
+                'name': None,
+                'diameter_in_mm': 556.2,
+                'vegan': True,
+                'original_unit': 'mm'
+            }
+        ]
+
+    def test_report_is_generated(self):
+        '''Assert that the report file is generated'''
+
+        path = './reports/test_reports.html'  # relative to the directory the tests are run from
+        report = Report(
+            data=self.data,
+            caption='Test Reports (created from unit test)',
+            path=path
+        )
+        report.create_report()
+        new_file = Path(path).resolve()
+
+        self.assertTrue(new_file.is_file())
\ No newline at end of file
diff --git a/tests/test_transformer.py b/tests/test_transformer.py
new file mode 100644
index 0000000..92a40ea
--- /dev/null
+++ b/tests/test_transformer.py
@@ -0,0 +1,166 @@
+from unittest.case import TestCase
+
+from etl.transformer import Transformer
+
+
+class TestTransformer(TestCase):
+    """
+    Test Transformer against raw rows shaped like the CSV input
+    """
+
+    def test_transformer_valid_unit_mm(self):
+        '''Assert that a well formed mm row is transformed field by field'''
+
+        transformer = Transformer(
+            raw_data=[
+                {
+                    "entry": "1",
+                    "cake_diameter": "56.78",
+                    "diam_unit": "mm",
+                    "flavor": "caramel",
+                    "is_cake_vegan": "No",
+                }
+            ]
+        )
+        res = transformer.transform_data()[0]
+
+        self.assertEqual(res.entry_id, 1)
+        self.assertEqual(res.name, "caramel")
+        self.assertEqual(res.diameter_in_mm, 56.78)
+        self.assertFalse(res.vegan)
+        self.assertEqual(res.original_unit, "mm")
+
+    def test_diameter_conversion(self):
+        '''Assert diameter in other units converts to mm'''
+
+        transformer = Transformer(
+            raw_data=[
+                {
+                    "entry": "2",
+                    "cake_diameter": "5",
+                    "diam_unit": "m",
+                    "flavor": "strawberry",
+                    "is_cake_vegan": "yes",
+                }
+            ]
+        )
+        res = transformer.transform_data()[0]
+
+        self.assertEqual(res.entry_id, 2)
+        self.assertEqual(res.name, "strawberry")
+        self.assertEqual(res.diameter_in_mm, 5000)  # 5 m expressed in mm
+        self.assertTrue(res.vegan)
+        self.assertEqual(res.original_unit, "m")
+
+    def test_irrecoverable_data_quality(self):
+        '''Assert that record is discarded when data quality is irrecoverable'''
+
+        transformer = Transformer(
+            raw_data=[
+                {
+                    "entry": "3",
+                    "cake_diameter": "56.78mm",  # unit inside the value disagrees with diam_unit
+                    "diam_unit": "in",
+                    "flavor": "caramel",
+                    "is_cake_vegan": "false",
+                },
+                {
+                    "entry": "4",
+                    "cake_diameter": "fill this info later",  # no numeric diameter at all
+                    "diam_unit": "in",
+                    "flavor": "caramel",
+                    "is_cake_vegan": "true",
+                }
+            ]
+        )
+        res = transformer.transform_data()
+
+        self.assertListEqual(res, [])
+
+    def test_mixed_diameter_value(self):
+        '''Assert diameter is resolved, even when it is in the form `56mm` '''
+
+        transformer = Transformer(
+            raw_data=[
+                {
+                    "entry": "5",
+                    "cake_diameter": "56.78mm",
+                    "diam_unit": "mm",
+                    "flavor": "caramel",
+                    "is_cake_vegan": "No",
+                }
+            ]
+        )
+        res = transformer.transform_data()[0]
+
+        self.assertEqual(res.entry_id, 5)
+        self.assertEqual(res.name, "caramel")
+        self.assertEqual(res.diameter_in_mm, 56.78)
+        self.assertFalse(res.vegan)
+        self.assertEqual(res.original_unit, "mm")
+
+    def test_valid_flavor(self):
+        '''Assert an unrecognised flavour/name resolves to None'''
+
+        transformer = Transformer(
+            raw_data=[
+                {
+                    "entry": "6",
+                    "cake_diameter": "60",
+                    "diam_unit": "mm",
+                    "flavor": "Invalid flavour",
+                    "is_cake_vegan": "No",
+                }
+            ]
+        )
+        res = transformer.transform_data()[0]
+
+        self.assertEqual(res.entry_id, 6)
+        self.assertIsNone(res.name)
+        self.assertEqual(res.diameter_in_mm, 60)
+        self.assertFalse(res.vegan)
+        self.assertEqual(res.original_unit, "mm")
+
+    def test_valid_vegan(self):
+        '''Assert that vegan is validated; invalid values resolve to None'''
+
+        transformer = Transformer(
+            raw_data=[
+                {
+                    "entry": "7",
+                    "cake_diameter": "78",
+                    "diam_unit": "mm",
+                    "flavor": "caramel",
+                    "is_cake_vegan": "Invalid Vegan",
+                }
+            ]
+        )
+        res = transformer.transform_data()[0]
+
+        self.assertEqual(res.entry_id, 7)
+        self.assertEqual(res.name, 'caramel')
+        self.assertEqual(res.diameter_in_mm, 78)
+        self.assertIsNone(res.vegan)
+        self.assertEqual(res.original_unit, "mm")
+
+    def test_valid_diameter_unit(self):
+        '''Assert empty diameter unit defaults to mm'''
+
+        transformer = Transformer(
+            raw_data=[
+                {
+                    "entry": "8",
+                    "cake_diameter": "80",
+                    "diam_unit": "",
+                    "flavor": "caramel",
+                    "is_cake_vegan": "y",
+                }
+            ]
+        )
+        res = transformer.transform_data()[0]
+
+        self.assertEqual(res.entry_id, 8)
+        self.assertEqual(res.name, 'caramel')
+        self.assertEqual(res.diameter_in_mm, 80)
+        self.assertTrue(res.vegan)
+        self.assertEqual(res.original_unit, "mm")
\ No newline at end of file