sql da

Hazemx95 · Jun 29, 2023 · 84555ce · 84555ce
1 parent eea44c2
commit 84555ce
Show file tree

Hide file tree

Showing 2 changed files with 179 additions and 58 deletions.
diff --git a/06-sql-data-analysis.ipynb b/06-sql-data-analysis.ipynb
@@ -4,106 +4,150 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# SQL For Data Analysis"
+    "# Load data (San Francisco Police Department)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "![Data Analysis Stages](./images/01sqlda.png)"
+    "1. download the sample data from the below website  \n",
+    "**[SFData](https://data.sfgov.org/Public-Safety/Police-Department-Incident-Reports-Historical-2003/tmnf-yvry/data)**\n"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "![Data Analysis Stages](./images/02sqlda.png)"
+    "2. save the csv file in the rdbms/databases/csv"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Profiling"
+    "3. prepare the database and load data  \n",
+    "`$ docker run -d -v /var/run/docker.sock:/tmp/docker.sock -v /etc/hosts:/tmp/hosts asami76/docker-hoster`  \n",
+    "`$ docker-compose -f postgresql.yml up`"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "* Exploratory Data Analysis  \n",
-    "* What is the data structure?  \n",
-    "* Quantitative Distribution  \n",
-    "* Visualizing results  \n",
-    "* Data Quality  \n",
-    "  "
+    "4. Attach to the postgresql container shell and load the csv into a new database"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "1. download the sample data from the below website  \n",
-    "**[SFData](https://data.sfgov.org/Public-Safety/Police-Department-Incident-Reports-Historical-2003/tmnf-yvry/data)**\n"
+    "```\n",
+    "# psql -U postgres\n",
+    "postgres=# CREATE DATABASE sfpolice;\n",
+    "postgres=# \\c sfpolice\n",
+    "sfpolice=# CREATE TABLE\n",
+    "    police_incident_reports (\n",
+    "        pd_id BIGINT,\n",
+    "        IncidentNum VARCHAR(10),\n",
+    "        \"Incident Code\" VARCHAR(10),\n",
+    "        Category VARCHAR(50),\n",
+    "        Descript VARCHAR(100),\n",
+    "        DayOfWeek VARCHAR(10),\n",
+    "        Date DATE,\n",
+    "        Time TIME,\n",
+    "        PdDistrict VARCHAR(10),\n",
+    "        Resolution VARCHAR(50),\n",
+    "        Address VARCHAR(100),\n",
+    "        X NUMERIC(9, 6),\n",
+    "        Y NUMERIC(9, 6),\n",
+    "        location VARCHAR(55),\n",
+    "        \"SF Find Neighborhoods 2 2\" FLOAT,\n",
+    "        \"Current Police Districts 2 2\" INT,\n",
+    "        \"Current Supervisor Districts 2 2\" INT,\n",
+    "        \"Analysis Neighborhoods 2 2\" INT,\n",
+    "        \"DELETE - Fire Prevention Districts 2 2\" INT,\n",
+    "        \"DELETE - Police Districts 2 2\" INT,\n",
+    "        \"DELETE - Supervisor Districts 2 2\" INT,\n",
+    "        \"DELETE - Zip Codes 2 2\" INT,\n",
+    "        \"DELETE - Neighborhoods 2 2\" INT,\n",
+    "        \"DELETE - 2017 Fix It Zones 2 2\" INT,\n",
+    "        \"Civic Center Harm Reduction Project Boundary 2 2\" INT,\n",
+    "        \"Fix It Zones as of 2017-11-06 2 2\" INT,\n",
+    "        \"DELETE - HSOC Zones 2 2\" INT,\n",
+    "        \"Fix It Zones as of 2018-02-07 2 2\" INT,\n",
+    "        \"CBD, BID and GBD Boundaries as of 2017 2 2\" INT,\n",
+    "        \"Areas of Vulnerability, 2016 2 2\" INT,\n",
+    "        \"Central Market/Tenderloin Boundary 2 2\" INT,\n",
+    "        \"Central Market/Tenderloin Boundary Polygon - Updated 2 2\" INT,\n",
+    "        \"HSOC Zones as of 2018-06-05 2 2\" INT,\n",
+    "        \"OWED Public Spaces 2 2\" INT,\n",
+    "        \"Neighborhoods 2\" INT\n",
+    "    );\n",
+    "sfpolice=# \\COPY police_incident_reports FROM '/usr/databases/csv/Police_Department_Incident_Reports__Historical_2003_to_May_2018.csv'  DELIMITER ',' CSV HEADER QUOTE '\"';\n",
+    "```"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "2. save the csv file in the rdbms/databases/csv"
+    "# SQL For Data Analysis"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "3. prepare the database and load data  \n",
-    "`$ docker run -d -v /var/run/docker.sock:/tmp/docker.sock -v /etc/hosts:/tmp/hosts asami76/docker-hoster`  \n",
-    "`$ docker-compose -f postgresql.yml up`"
+    "![Data Analysis Stages](./images/01sqlda.png)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "4. Attach to the postgresql container shell and load the csv into a new database"
+    "![Data Analysis Stages](./images/02sqlda.png)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "['PdId', 'IncidntNum', 'Incident Code', 'Category', 'Descript',\n",
-    "       'DayOfWeek', 'Date', 'Time', 'PdDistrict', 'Resolution', 'Address', 'X',\n",
-    "       'Y', 'location', 'SF Find Neighborhoods 2 2',\n",
-    "       'Current Police Districts 2 2', 'Current Supervisor Districts 2 2',\n",
-    "       'Analysis Neighborhoods 2 2', 'DELETE - Fire Prevention Districts 2 2',\n",
-    "       'DELETE - Police Districts 2 2', 'DELETE - Supervisor Districts 2 2',\n",
-    "       'DELETE - Zip Codes 2 2', 'DELETE - Neighborhoods 2 2',\n",
-    "       'DELETE - 2017 Fix It Zones 2 2',\n",
-    "       'Civic Center Harm Reduction Project Boundary 2 2',\n",
-    "       'Fix It Zones as of 2017-11-06  2 2', 'DELETE - HSOC Zones 2 2',\n",
-    "       'Fix It Zones as of 2018-02-07 2 2',\n",
-    "       'CBD, BID and GBD Boundaries as of 2017 2 2',\n",
-    "       'Areas of Vulnerability, 2016 2 2',\n",
-    "       'Central Market/Tenderloin Boundary 2 2',\n",
-    "       'Central Market/Tenderloin Boundary Polygon - Updated 2 2',\n",
-    "       'HSOC Zones as of 2018-06-05 2 2', 'OWED Public Spaces 2 2',\n",
-    "       'Neighborhoods 2'],\n",
-    "      dtype='object'"
+    "## Profiling"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "# display a pandas dataframe with full column width \n",
-    "pd.set_option('display.max_colwidth', -1)\n"
+    "* Exploratory Data Analysis  \n",
+    "* What is the data structure?  \n",
+    "* Quantitative Distribution  \n",
+    "* Visualizing results  \n",
+    "* Data Quality  \n",
+    "  "
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Data Cleanup"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "* Duplication  \n",
+    "* Data Cleaning  \n",
+    "* Nulls  "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": []
   }
  ],
  "metadata": {

diff --git a/06-sql-data-analysis.sql b/06-sql-data-analysis.sql
@@ -1,18 +1,95 @@
-# get the data from the below link
-# https://data.sfgov.org/Public-Safety/Police-Department-Incident-Reports-Historical-2003/tmnf-yvry/data
-
-CREATE TABLE police_incident_reports (
-  incident_num VARCHAR(10) PRIMARY KEY,
-  category VARCHAR(50),
-  descript VARCHAR(100),
-  dayofweek VARCHAR(10),
-  date DATE,
-  time TIME,
-  pd_district VARCHAR(10),
-  resolution VARCHAR(50),
-  address VARCHAR(100),
-  x NUMERIC(9,6),
-  y NUMERIC(9,6),
-  location POINT,
-  pd_id BIGINT
-);
+-- Active: 1688066368502@@pg-db@5432@sfpolice
+-- get the data from the below link
+-- https://data.sfgov.org/Public-Safety/Police-Department-Incident-Reports-Historical-2003/tmnf-yvry/data
+
+-- make sure you have a connection created using the vscode extension
+
+-- Exploring data
+-- 1. How many police districts are there?
+SELECT
+    COUNT(DISTINCT pddistrict)
+FROM police_incident_reports;

+-- How many neighborhoods are there?
+SELECT
+    COUNT(DISTINCT "Neighborhoods 2")
+FROM police_incident_reports;

+-- How many incidents by neighborhood? (Try to visualize the result in pgAdmin.)
+SELECT
+    "Neighborhoods 2",
+    COUNT(*) AS count
+FROM police_incident_reports
+GROUP BY "Neighborhoods 2"
+ORDER BY COUNT(*);

+-- Min, max and average number of incidents per neighborhood,
+-- computed over the per-neighborhood counts produced by the derived table.
+SELECT
+    MAX(per_hood.count) AS max,
+    MIN(per_hood.count) AS min,
+    AVG(per_hood.count) AS avg
+FROM (
+    SELECT
+        "Neighborhoods 2" AS neighborhood,
+        COUNT(*)
+    FROM police_incident_reports
+    GROUP BY "Neighborhoods 2"
+    ORDER BY COUNT(*)
+) AS per_hood;
+
+
+-- KPIs for incidents by neighborhood
+-- declare min, max, and avg as variables
+
+
+-- avergae_no_of_incidents(): average number of incidents per neighborhood.
+-- NOTE: the name is misspelled ("avergae"); it is kept as-is because the
+-- queries that follow in this script call the function by this name.
+--
+-- Returns: INT. AVG() produces a NUMERIC; storing it in the INTEGER variable
+--   coerces it to a whole number, so the fractional part is lost.
+--   The result is NULL when police_incident_reports has no rows.
+-- Rows whose "Neighborhoods 2" is NULL are counted as one group of their own.
+--
+-- STABLE: the function only reads from the table and never modifies data, so
+-- it does not need the default VOLATILE classification; this lets the planner
+-- optimize repeated calls within a single statement.
+CREATE OR REPLACE FUNCTION avergae_no_of_incidents()
+RETURNS INT
+AS $$
+DECLARE
+    average integer := 0;
+BEGIN
+    -- One row per neighborhood with its incident count; no ORDER BY is needed
+    -- because row order cannot change the value of AVG().
+    SELECT AVG(s.count) INTO average
+    FROM (
+        SELECT COUNT(*) AS count
+        FROM police_incident_reports
+        GROUP BY "Neighborhoods 2"
+    ) AS s;

+    RETURN average;
+END;
+$$ LANGUAGE plpgsql STABLE;
+
+-- Show the value returned by the function.
+SELECT avergae_no_of_incidents();

+-- Label each neighborhood 'High' when its incident count is above the
+-- integer average returned by the function, otherwise 'LOW'.
+SELECT
+    "Neighborhoods 2" AS neighborhood,
+    COUNT(*),
+    CASE
+        WHEN COUNT(*) > avergae_no_of_incidents() THEN 'High'
+        ELSE 'LOW'
+    END AS state
+FROM police_incident_reports
+GROUP BY "Neighborhoods 2"
+ORDER BY COUNT(*);


+-- replace the function with a subquery
+-- The scalar subquery computes the same per-neighborhood average inline.
+-- Unlike the function, it compares against the unrounded NUMERIC average.
+SELECT
+    "Neighborhoods 2" AS neighborhood,
+    COUNT(*),
+    CASE
+        WHEN COUNT(*) > (
+            SELECT AVG(s.count)
+            FROM (
+                SELECT COUNT(*) AS count
+                FROM police_incident_reports
+                GROUP BY "Neighborhoods 2"
+            ) AS s
+        ) THEN 'High'
+        ELSE 'LOW'
+    END AS state
+FROM police_incident_reports
+GROUP BY "Neighborhoods 2"
+ORDER BY COUNT(*);
+
+
+-- Data Cleanup 
+-- Deduplication using Group By and Distinct 
+-- Data Cleaning using CASE WHEN and LIKE
+-- Data Cleaning using TRIM, UPPER, LOWER, INITCAP, SUBSTRING, POSITION, LENGTH, CONCAT, REPLACE, TRANSLATE, REGEXP_REPLACE, REGEXP_MATCHES, REGEXP_SPLIT_TO_ARRAY, REGEXP_SPLIT_TO_TABLE, SPLIT_PART, TO_CHAR, TO_NUMBER, TO_DATE, TO_TIMESTAMP, TO_JSON, TO_JSONB, TO_ASCII, TO_HEX, TO_REGCLASS, TO_REGPROC, TO_REGPROCEDURE, TO_REGOPER
+-- Type Conversion using CAST and :: 
+-- pd_id is a BIGINT column: it can be selected and used in arithmetic directly.
+SELECT pd_id
+FROM police_incident_reports
+LIMIT 3;

+SELECT pd_id * 2
+FROM police_incident_reports
+LIMIT 3;

+-- After converting to VARCHAR (with :: or with CAST) arithmetic is expected to
+-- fail; both statements below are intentional error demonstrations.
+SELECT (pd_id::varchar) + 2
+FROM police_incident_reports
+LIMIT 3;

+SELECT CAST(pd_id AS varchar) * 2
+FROM police_incident_reports
+LIMIT 3;

+-- split string
+-- Works: the value is cast to a string before being split on '0'.
+SELECT split_part(CAST(pd_id AS varchar), '0', 1)
+FROM police_incident_reports
+LIMIT 3;

+-- Intentional error demonstration: the raw BIGINT column cannot be split.
+SELECT split_part(pd_id, '0', 1)
+FROM police_incident_reports
+LIMIT 3;
+
+
+
+-- Window Functions

+-- Peek at a few rows of the table.
+SELECT * FROM public.police_incident_reports LIMIT 5;

+-- Plain aggregate: one output row per category.
+SELECT
+    category,
+    COUNT(incidentnum)
+FROM police_incident_reports
+GROUP BY category;

+-- What if I want to display the count of incidents by category but still see the rest of the columns in the table?

+-- This will not work: adding pddistrict to GROUP BY breaks the count down
+-- by category and then by pddistrict instead of by category alone.
+SELECT
+    category,
+    pddistrict,
+    COUNT(incidentnum)
+FROM police_incident_reports
+GROUP BY category, pddistrict;

+-- Must use the window function: every row is kept and the count is attached to each one.
+SELECT
+    *,
+    COUNT(incidentnum) OVER ()
+FROM police_incident_reports;

+-- take a window of the full table
+SELECT
+    category,
+    pddistrict,
+    descript,
+    COUNT(incidentnum) OVER ()
+FROM police_incident_reports;

+-- change the scope of the window to the category alone
+SELECT
+    category,
+    pddistrict,
+    descript,
+    COUNT(incidentnum) OVER (PARTITION BY category)
+FROM police_incident_reports;