diff --git a/Notebook.ipynb b/Notebook.ipynb index a5bfd19..8e17ffa 100644 --- a/Notebook.ipynb +++ b/Notebook.ipynb @@ -44,6 +44,8 @@ "import seaborn as sns\n", "import boto3\n", "\n", + "from trino.dbapi import connect\n", + "\n", "import sklearn\n", "from sklearn.metrics import confusion_matrix\n", "from sklearn.model_selection import train_test_split\n", @@ -102,19 +104,34 @@ "bucketName='PASTE-YOUR-S3-BUCKET-NAME-WHICH-HAS-DATASET'\n", "\n", "# Set the current working directory to the base of this project\n", - "os.chdir('/opt/app-root/src/rhods-fraud-detection/')\n", - "\n", - "## Helper function to download the \"clean\" folder from s3 bucket\n", - "def downloadDirectoryFroms3(bucketName, remoteDirectoryName):\n", - " s3_resource = session.resource('s3')\n", - " bucket = s3_resource.Bucket(bucketName)\n", - " for obj in bucket.objects.filter(Prefix = remoteDirectoryName):\n", - " if not os.path.exists(os.path.dirname(obj.key)):\n", - " os.makedirs(os.path.dirname(obj.key))\n", - " bucket.download_file(obj.key, obj.key)\n", - "\n", - "# Call the helper function\n", - "downloadDirectoryFroms3(bucketName, remoteDirectoryName)" + "os.chdir('/opt/app-root/src/rhods-fraud-detection/')" ] }, { + "cell_type": "code", + "execution_count": null, + "id": "2f2fcc0e", + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: Take snapshot of materialized view to save in s3 (showcase write to s3)" ] }, { + "cell_type": "code", + "execution_count": null, + "id": "c65ac752", + "metadata": {}, + "outputs": [], + "source": [ + "# Connect to Starburst with the Trino Python client (DBAPI)\n", + "conn = connect(\n", + " host=\"coordinator.starburst.svc.cluster.local\",\n", + " port=8080,\n", + " user=\"user\",\n", + " catalog=\"s3\",\n", + " schema=\"fraud\",\n", + ")" ] }, { @@ -128,12 +145,19 @@ }, "outputs": [], "source": [ - "path = os.getcwd()+'/clean'\n", - "csv_file = glob.glob(os.path.join(path, \"*\"))\n", - "\n", - "# Clean folder should have one file therefore we have array index as 0\n", - "raw_df = pd.read_csv(csv_file[0])\n", - "raw_df.head()" + "raw_df = pd.read_sql(\"SELECT * FROM s3.fraud.data\", conn)\n", + "raw_df = raw_df.drop(\"id\", axis=1) # Remove \"id\" column for model training" ] }, { + "cell_type": "code", + "execution_count": null, + "id": "338675c7", + "metadata": {}, + "outputs": [], + "source": [ + "# Cast the string-typed feature columns (v1-v28) to float64\n", + "for col in raw_df.iloc[:, 0:28]:\n", + " raw_df[col] = raw_df[col].astype(np.float64)" ] }, { @@ -321,10 +345,13 @@ " keras.metrics.Accuracy(name=\"Accuracy\"),\n", " keras.metrics.Precision(name=\"Precision\"),\n", " keras.metrics.Recall(name=\"Recall\")]\n", + "\n", "# Compiling and fitting the model\n", "model.compile(optimizer = \"adam\", loss = \"binary_crossentropy\", metrics = metrics)\n", + "\n", "# Change the epochs to a lower number if you want this to run quickly, but fewer epochs mean lower accuracy and vice versa.\n", "model.fit(X_train_SMOTE, y_train_SMOTE, batch_size = 32, epochs = 100)\n", + "\n", "print(\"Evaluate on test data\")\n", "score = model.evaluate(X_test, y_test)\n", "print(\"test loss, test accuracy, test precision, test recall:\", score)" diff --git a/README.md b/README.md index 6db8350..b51c840 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,8 @@ content, you will become familiar with the OpenShift Data Science offering and common workflows to use with it. This repo uses RHODS ModelMesh to deploy the model which uses OpenVino Model Server (OVMS) under the hood.
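The new notebook cells above open a Trino DBAPI connection to Starburst but leave the "snapshot to S3" cell as a TODO. A minimal sketch of one way that cell could be filled in, assuming a hypothetical table name `data_snapshot` (not part of the repository); because the `s3.fraud` schema's location points at the S3 bucket, the result of the CTAS statement is written back to S3:

```python
# Hypothetical sketch for the TODO cell above: persist a snapshot of the
# materialized view as a new table in the S3-backed catalog.
cur = conn.cursor()  # "conn" is the trino.dbapi connection created above
cur.execute(
    "CREATE TABLE IF NOT EXISTS s3.fraud.data_snapshot AS "
    "SELECT * FROM s3.fraud.data"
)
print(cur.fetchall())  # CTAS reports the number of rows written
```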
+This version of the demonstration showcases several Starburst features that are useful in data science workflows: reading from and writing to AWS S3, running federated queries across multiple data sources, and using materialized views. + ## Access OpenShift Data Science **IMPORTANT!!** @@ -63,7 +65,7 @@ Click the Git icon and then click _Clone a Repository_. In the window that pops up, copy the Git URL for this repository and paste it into the box: ``` -https://github.com/RHEcosystemAppEng/rhods-fraud-detection +https://github.com/aceriverson/rhods-fraud-detection ``` Then, click _CLONE_. diff --git a/Starburst.md b/Starburst.md index 061688c..d5b5adb 100644 --- a/Starburst.md +++ b/Starburst.md @@ -5,7 +5,8 @@ - RHODS working cluster with administrator access - Starburst Enterprise licence [optional when using demo.redhat.com [workshop](https://demo.redhat.com/catalog?item=babylon-catalog-prod/sandboxes-gpte.ocp4-workshop-fraud-detection.prod&utm_source=webapp&utm_medium=share-link)] - Write access to your own Amazon S3 bucket [optional when using demo.redhat.com [workshop](https://demo.redhat.com/catalog?item=babylon-catalog-prod/sandboxes-gpte.ocp4-workshop-fraud-detection.prod&utm_source=webapp&utm_medium=share-link)] -- Access to the [original dataset](https://drive.google.com/file/d/1YhmV3vPbFe-JXU_biwvaizV0WGhAegH1/view) [optional when using demo.redhat.com [workshop](https://demo.redhat.com/catalog?item=babylon-catalog-prod/sandboxes-gpte.ocp4-workshop-fraud-detection.prod&utm_source=webapp&utm_medium=share-link)] +- Access to a PostgreSQL instance and user with SELECT, INSERT, and CREATE permissions [optional when using demo.redhat.com [workshop](https://demo.redhat.com/catalog?item=babylon-catalog-prod/sandboxes-gpte.ocp4-workshop-fraud-detection.prod&utm_source=webapp&utm_medium=share-link)] +- Access to the [original dataset](https://gpte-public.s3.amazonaws.com/creditcard_with_empty_values.tar.gz) [optional when using demo.redhat.com [workshop](https://demo.redhat.com/catalog?item=babylon-catalog-prod/sandboxes-gpte.ocp4-workshop-fraud-detection.prod&utm_source=webapp&utm_medium=share-link)] #### What is Trino and Starburst Enterprise? @@ -23,22 +24,45 @@ In this document we will cover how to work with the data of the fraud-detection example with SQL-like queries and an instance of SEP running in the Red Hat Openshift Data Science cluster. +We will take advantage of several powerful Starburst features: reading from and writing to AWS S3, executing federated queries across multiple data sources, and creating materialized views to aid our data science workflow. + ### Steps: -1. Store the original data in S3 [optional when using demo.redhat.com [workshop](https://demo.redhat.com/catalog?item=babylon-catalog-prod/sandboxes-gpte.ocp4-workshop-fraud-detection.prod&utm_source=webapp&utm_medium=share-link)] +1. Store the features data in S3 [optional when using demo.redhat.com [workshop](https://demo.redhat.com/catalog?item=babylon-catalog-prod/sandboxes-gpte.ocp4-workshop-fraud-detection.prod&utm_source=webapp&utm_medium=share-link)] Upload the -file [creditcard_with_empty_values.csv](https://drive.google.com/file/d/1YhmV3vPbFe-JXU_biwvaizV0WGhAegH1/view) +file [features.csv](https://gpte-public.s3.amazonaws.com/creditcard_with_empty_values.tar.gz) to a folder called `/data` inside your bucket. In this example, we will name the bucket `rhods-fraud-detection`. Your AWS credentials **must** have read and write access to this bucket. -2.
Set credentials and configure Starburst Enterprise [optional when using demo.redhat.com [workshop](https://demo.redhat.com/catalog?item=babylon-catalog-prod/sandboxes-gpte.ocp4-workshop-fraud-detection.prod&utm_source=webapp&utm_medium=share-link)] +2. Store the transactions data in Postgres [optional when using demo.redhat.com [workshop](https://demo.redhat.com/catalog?item=babylon-catalog-prod/sandboxes-gpte.ocp4-workshop-fraud-detection.prod&utm_source=webapp&utm_medium=share-link)] + + With the file [transactions.csv](https://gpte-public.s3.amazonaws.com/creditcard_with_empty_values.tar.gz) downloaded, execute the following queries in the `psql` command-line tool: + + ```sql + CREATE TABLE transactions ( + id SERIAL, + Time INTEGER, + Amount NUMERIC(10,2), + Class INTEGER, + PRIMARY KEY (id) + ); + ``` + ```sql + COPY transactions(id, Time, Amount, Class) + FROM '/path/to/transactions.csv' + DELIMITER ',' + CSV HEADER; + ``` + +3. Set credentials and configure Starburst Enterprise [optional when using demo.redhat.com [workshop](https://demo.redhat.com/catalog?item=babylon-catalog-prod/sandboxes-gpte.ocp4-workshop-fraud-detection.prod&utm_source=webapp&utm_medium=share-link)] 1. Go to the configs directory `cd ./configs` 2. Update the [01_starburst_licence.yaml](configs/01_starburst_licence.yaml) file with your own Starburst Enterprise license. 3. Update the [02_aws_credentials.yaml](configs/02_aws_credentials.yaml) file with your own Amazon credentials. - 4. Apply the configuration files `cat *.yaml | oc apply -f -` + 4. Update the [03_postgres_credentials.yaml](configs/03_postgres_credentials.yaml) file with your own PostgreSQL server and user details. + 5. Apply the configuration files `cat *.yaml | oc apply -f -`
Expected output @@ -46,13 +70,19 @@ Your AWS credentials **must** have read and write access to this bucket. $: cat *.yaml | oc apply -f - secret/starburstdata created secret/aws-credentials created + secret/postgres-credentials created starburstenterprise.charts.starburstdata.com/starburstenterprise created starbursthive.charts.starburstdata.com/starbursthive created route.route.openshift.io/starburst-web created ```
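Once the configuration has been applied, you can sanity-check the deployment from a workbench before moving on. A minimal sketch using the Trino Python client listed in `requirements.txt`; this snippet is illustrative and not part of the repository, and the host and user simply mirror the connection used in the notebook:

```python
from trino.dbapi import connect

# Confirm the coordinator is reachable and that the "s3" and "postgres"
# catalogs defined in 04_starburst_enterprise.yaml are registered.
conn = connect(host="coordinator.starburst.svc.cluster.local", port=8080, user="user")
cur = conn.cursor()
cur.execute("SHOW CATALOGS")
print([row[0] for row in cur.fetchall()])  # expect "s3" and "postgres" in the list
```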
-3. Working from the Starburst Web UI + + *This next step can be done in one of two ways\*:* + 1. *Follow steps 4 and 5 to process the data in the Starburst WebUI* + 2. *Skip steps 4 and 5 then execute all data processing in the Jupyter Notebook* + +4. (Optional\*) Working from the Starburst Web UI If you are using demo.redhat.com [workshop](https://demo.redhat.com/catalog?item=babylon-catalog-prod/sandboxes-gpte.ocp4-workshop-fraud-detection.prod&utm_source=webapp&utm_medium=share-link) then the UI link is in the email which you receive after provisioning. @@ -60,9 +90,10 @@ Your AWS credentials **must** have read and write access to this bucket. with a route, it should be available through the URL `http://starburst-web./ui/insights/ide` and the configured credentials (`default user: admin`). At this point, you should see the query editor in the web ui + ![SEP Web UI](./images/sep_webui.png) -4. Queries to read and write using Starburst Enterprise +5. (Optional\*) Queries to read and write using Starburst Enterprise > **Note:** > 1. Please click the arrow ▶ below to view and/or copy the query > 2. Please put your own s3 bucket name in the below queries by changing the placeholder "CHANGE-THIS-BUCKET-NAME". If you are using demo.redhat.com [workshop](https://demo.redhat.com/catalog?item=babylon-catalog-prod/sandboxes-gpte.ocp4-workshop-fraud-detection.prod&utm_source=webapp&utm_medium=share-link) then the bucket name is in the email which you received after provisioning. @@ -79,9 +110,9 @@ CREATE SCHEMA s3.fraud WITH (location = 's3a://CHANGE-THIS-BUCKET-NAME/data'); Create a table reading the original dataset from S3 ```SQL -CREATE TABLE IF NOT EXISTS s3.fraud.original +CREATE TABLE IF NOT EXISTS s3.fraud.features ( - time VARCHAR, + id VARCHAR, v1 VARCHAR, v2 VARCHAR, v3 VARCHAR, @@ -109,9 +140,7 @@ CREATE TABLE IF NOT EXISTS s3.fraud.original v25 VARCHAR, v26 VARCHAR, v27 VARCHAR, - v28 VARCHAR, - amount VARCHAR, - class VARCHAR + v28 VARCHAR ) WITH ( external_location = 's3a://CHANGE-THIS-BUCKET-NAME/data/', skip_header_line_count = 1, @@ -125,76 +154,80 @@ CREATE TABLE IF NOT EXISTS s3.fraud.original Verify you can read the original data loaded into SEP ```SQL -SELECT * FROM s3.fraud.original; +SELECT * FROM s3.fraud.features; ``` + ![SEP Web UI](./images/sep_webui_reading.png)
- Set session variable + Create a materialized view with only the filtered rows using a federated query ```SQL -SHOW SESSION LIKE 'writer_%'; -SET SESSION writer_min_size = '160MB'; +CREATE MATERIALIZED VIEW s3.fraud.data AS +WITH + t1 AS ( + SELECT * FROM s3.fraud.features + WHERE v1 != '' + AND v2 != '' + AND v3 != '' + AND v4 != '' + AND v5 != '' + AND v6 != '' + AND v7 != '' + AND v8 != '' + AND v9 != '' + AND v10 != '' + AND v11 != '' + AND v12 != '' + AND v13 != '' + AND v14 != '' + AND v15 != '' + AND v16 != '' + AND v17 != '' + AND v18 != '' + AND v19 != '' + AND v20 != '' + AND v21 != '' + AND v22 != '' + AND v23 != '' + AND v24 != '' + AND v25 != '' + AND v26 != '' + AND v27 != '' + AND v28 != '' + ), + t2 AS ( + SELECT id, time, amount, class FROM postgres.public.transactions + WHERE amount IS NOT NULL AND class IS NOT NULL + ) +SELECT t1.*, t2.time, t2.amount, t2.class +FROM t1 +JOIN t2 ON t1.id = CAST(t2.id AS varchar); ``` + + +
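The materialized view is persisted through the cache service configured in `04_starburst_enterprise.yaml`. If the underlying S3 or Postgres data changes, the stored results can be brought up to date again; a hedged sketch of a manual refresh issued from the notebook connection (the same statement can be run from the Web UI), assuming the view name created above:

```python
from trino.dbapi import connect

# Re-run the view's defining query so the stored results pick up new or
# corrected rows in s3.fraud.features and postgres.public.transactions.
conn = connect(host="coordinator.starburst.svc.cluster.local", port=8080, user="user")
cur = conn.cursor()
cur.execute("REFRESH MATERIALIZED VIEW s3.fraud.data")
cur.fetchall()
```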
- Create a second table with only the filtered rows + Verify you can read the combined and cleaned data from the materialized view ```SQL -CREATE TABLE IF NOT EXISTS s3.fraud.clean - WITH ( - external_location = 's3a://CHANGE-THIS-BUCKET-NAME/clean/', - format = 'csv', - skip_header_line_count=1 - ) AS ( - SELECT * - FROM s3.fraud.original - WHERE v1 != '' - AND v2 != '' - AND v3 != '' - AND v4 != '' - AND v5 != '' - AND v6 != '' - AND v7 != '' - AND v8 != '' - AND v9 != '' - AND v10 != '' - AND v11 != '' - AND v12 != '' - AND v13 != '' - AND v14 != '' - AND v15 != '' - AND v16 != '' - AND v17 != '' - AND v18 != '' - AND v19 != '' - AND v20 != '' - AND v21 != '' - AND v22 != '' - AND v23 != '' - AND v24 != '' - AND v25 != '' - AND v26 != '' - AND v27 != '' - AND v28 != '' - AND amount != '' - AND class != ''); +SELECT * FROM s3.fraud.data; ``` - -![SEP Web UI](./images/sep_webui_writing.png) - -> **Note:** This query might take some minutes depending on the network between -> RHODS and the AWS S3 bucket. -
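If you skipped the Web UI route described in steps 4 and 5, the same statements can be submitted from the Jupyter notebook through the Trino connection instead. A rough sketch under that assumption (only the schema creation is shown; the `CREATE TABLE` and `CREATE MATERIALIZED VIEW` statements above can be passed to `cur.execute()` in exactly the same way):

```python
from trino.dbapi import connect

conn = connect(host="coordinator.starburst.svc.cluster.local", port=8080, user="user")
cur = conn.cursor()

# Same DDL as in step 5, issued from the workbench instead of the Web UI.
cur.execute(
    "CREATE SCHEMA IF NOT EXISTS s3.fraud "
    "WITH (location = 's3a://CHANGE-THIS-BUCKET-NAME/data')"
)
cur.fetchall()
```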
-**Result:** Now you can verify that the S3 bucket `rhods-fraud-detection/clean`, + + +--- #### Go back to the [Notebook.ipynb](./Notebook.ipynb) and continue with the process. diff --git a/configs/03_postgres_credentials.yaml b/configs/03_postgres_credentials.yaml new file mode 100644 index 0000000..04e7254 --- /dev/null +++ b/configs/03_postgres_credentials.yaml @@ -0,0 +1,11 @@ +--- +kind: Secret +apiVersion: v1 +metadata: + name: postgres-credentials +data: + POSTGRES_HOST: + POSTGRES_PORT: + POSTGRES_USER: + POSTGRES_PASSWORD: +type: Opaque \ No newline at end of file diff --git a/configs/03_starburst_enterprise.yaml b/configs/04_starburst_enterprise.yaml similarity index 79% rename from configs/03_starburst_enterprise.yaml rename to configs/04_starburst_enterprise.yaml index 758fb79..e5967f9 100644 --- a/configs/03_starburst_enterprise.yaml +++ b/configs/04_starburst_enterprise.yaml @@ -16,6 +16,8 @@ spec: envFrom: - secretRef: name: aws-credentials + - secretRef: + name: postgres-credentials catalogs: s3: |- connector.name=hive @@ -26,10 +28,21 @@ spec: hive.allow-drop-table=true hive.metastore-cache-ttl=60m hive.compression-codec=none + materialized-views.enabled=true + materialized-views.namespace=mv + materialized-views.storage-schema=mvstorage + cache-service.uri=http://coordinator.starburst.svc.cluster.local:8180 + postgres: |- + connector.name=postgresql + connection-url=jdbc:postgresql://${ENV:POSTGRES_HOST}:${ENV:POSTGRES_PORT}/ + connection-password=${ENV:POSTGRES_PASSWORD} + connection-user=${ENV:POSTGRES_USER} coordinator: envFrom: - secretRef: name: aws-credentials + - secretRef: + name: postgres-credentials heapHeadroomPercentage: 30 heapSizePercentage: 90 nodeMemoryHeadroom: 1Gi @@ -77,6 +90,12 @@ spec: plugin.dir=/usr/lib/starburst/plugin node.server-log-file=/var/log/starburst/server.log node.launcher-log-file=/var/log/starburst/launcher.log + cache.properties: + service-database.user=starburst + service-database.password=starburst + service-database.jdbc-url=jdbc:postgresql://postgresql.starburst.svc.cluster.local:5432/ + starburst.user=user + starburst.jdbc-url=jdbc:trino://coordinator:8080 resources: limits: cpu: 0.5 diff --git a/configs/04_starburst_hive.yaml b/configs/05_starburst_hive.yaml similarity index 100% rename from configs/04_starburst_hive.yaml rename to configs/05_starburst_hive.yaml diff --git a/configs/05_starburst_route.yaml b/configs/06_starburst_route.yaml similarity index 100% rename from configs/05_starburst_route.yaml rename to configs/06_starburst_route.yaml diff --git a/requirements.txt b/requirements.txt index 6d2093d..a3e7045 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,5 @@ seaborn==0.12.2 onnx==1.12.0 openvino==2022.3.0 openvino-dev==2022.3.0 -openvino-telemetry==2022.3.0 \ No newline at end of file +openvino-telemetry==2022.3.0 +trino \ No newline at end of file
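One detail worth noting about the new `03_postgres_credentials.yaml`: it places the connection details under the `data:` field of a Kubernetes Secret, so the values you fill in must be base64-encoded (plain-text values only work under `stringData:`). A small helper sketch for producing the encoded values; the connection details shown are illustrative placeholders:

```python
import base64

# Encode plain-text connection details for the Secret's data: fields.
for key, value in {
    "POSTGRES_HOST": "postgresql.example.svc.cluster.local",  # illustrative values
    "POSTGRES_PORT": "5432",
    "POSTGRES_USER": "fraud",
    "POSTGRES_PASSWORD": "changeme",
}.items():
    print(key, base64.b64encode(value.encode()).decode())
```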