2 blogs

vplauzon · Jan 13, 2018 · 434dd24 · 434dd24
1 parent 32b1074
commit 434dd24
Show file tree

Hide file tree

Showing 8 changed files with 99,898 additions and 0 deletions.
diff --git a/dataframes/porgat.txt b/dataframes/porgat.txt
diff --git a/dataframes/sql-notebook.ipynb b/dataframes/sql-notebook.ipynb
@@ -0,0 +1 @@
+{"cells":[{"cell_type":"code","source":["#  Replace with your container and storage account:  \"wasbs://<container>@<storage account>.blob.core.windows.net/\"\npathPrefix = \"wasbs://<container>@<storage account>.blob.core.windows.net/\"\n#  Fetch porgat.txt from storage account\nfile = sc.textFile(pathPrefix + \"porgat.txt\")"],"metadata":{},"outputs":[],"execution_count":1},{"cell_type":"code","source":["#  Remove the headers from the file:  lines starting with a star\nnoHeaders = file.filter(lambda x: len(x)>0 and x[0]!='*')\n#  Extract a pair from each line:  the leading integer and a string for the rest of the line\npaired = noHeaders.map(lambda l:  l.partition(' ')).filter(lambda t:  len(t)==3 and len(t[0])>0 and len(t[2])>0).map(lambda t: (int(t[0]), t[2]))\n#  Filter relationships as they do not start with quotes, then split the integer list\nscatteredRelationships = paired.filter(lambda (charId, text):  text[0]!='\"').map(lambda (charId, text): (charId, [int(x) for x in text.split(' ')]))\n#  Relationships for the same character id sometimes span more than one line in the file, so let's group them together\nrelationships = scatteredRelationships.reduceByKey(lambda pubList1, pubList2: pubList1 + pubList2)\n#  Filter non-relationships as they start with quotes ; remove the quotes\nnonRelationships = paired.filter(lambda (index, text):  text[0]=='\"').map(lambda (index, text):  (index, text[1:-1].strip()))\n#  Characters stop at a certain line (part of the initial header ; we hardcode it here)\ncharacters = nonRelationships.filter(lambda (charId, name): charId<=6486)\n#  Publications start after the characters\npublications = nonRelationships.filter(lambda (charId, name): charId>6486)"],"metadata":{},"outputs":[],"execution_count":2},{"cell_type":"code","source":["from pyspark.sql import Row\n\n#  Let's create dataframes out of the RDDs and register them as temporary views for SQL to use\n\n#  Relationships has a list as a component, let's flatten that\nflatRelationships = 
relationships.flatMap(lambda (charId, pubList):  [(charId, pubId) for pubId in pubList])\n#  Let's map the relationships to an RDD of rows in order to create a data frame out of it\nrelationshipsDf = spark.createDataFrame(flatRelationships.map(lambda t: Row(charId=t[0], pubId=t[1])))\n#  Register relationships as a temporary view\nrelationshipsDf.createOrReplaceTempView(\"relationships\")\n\n#  Let's do the same for characters\ncharactersDf = spark.createDataFrame(characters.map(lambda t:  Row(charId=t[0], name=t[1])))\ncharactersDf.createOrReplaceTempView(\"characters\")\n\n#  and for publications\npublicationsDf = spark.createDataFrame(publications.map(lambda t:  Row(pubId=t[0], name=t[1])))\npublicationsDf.createOrReplaceTempView(\"publications\")\n"],"metadata":{},"outputs":[],"execution_count":3},{"cell_type":"code","source":["%sql\n\nSELECT c1.name AS name1, c2.name AS name2, sub.charId1, sub.charId2, sub.pubCount\nFROM\n(\n  SELECT r1.charId AS charId1, r2.charId AS charId2, COUNT(r1.pubId, r2.pubId) AS pubCount\n  FROM relationships AS r1\n  CROSS JOIN relationships AS r2\n  WHERE r1.charId < r2.charId\n  AND r1.pubId=r2.pubId\n  GROUP BY r1.charId, r2.charId\n) AS sub\nINNER JOIN characters c1 ON c1.charId=sub.charId1\nINNER JOIN characters c2 ON c2.charId=sub.charId2\nORDER BY sub.pubCount DESC\nLIMIT 10"],"metadata":{},"outputs":[],"execution_count":4},{"cell_type":"code","source":["%sql\n\nSELECT c1.name AS name1, c2.name AS name2, c3.name AS name3, sub.charId1, sub.charId2, sub.charId3, sub.pubCount\nFROM\n(\n  SELECT r1.charId AS charId1, r2.charId AS charId2, r3.charId AS charId3, COUNT(r1.pubId, r2.pubId, r3.pubId) AS pubCount\n  FROM relationships AS r1\n  CROSS JOIN relationships AS r2\n  CROSS JOIN relationships AS r3\n  WHERE r1.charId < r2.charId\n  AND r2.charId < r3.charId\n  AND r1.pubId=r2.pubId\n  AND r2.pubId=r3.pubId\n  GROUP BY r1.charId, r2.charId, r3.charId\n) AS sub\nINNER JOIN characters c1 ON c1.charId=sub.charId1\nINNER JOIN 
characters c2 ON c2.charId=sub.charId2\nINNER JOIN characters c3 ON c3.charId=sub.charId3\nORDER BY sub.pubCount DESC\nLIMIT 10"],"metadata":{},"outputs":[],"execution_count":5}],"metadata":{"name":"SQL Blog","notebookId":3185064906819836},"nbformat":4,"nbformat_minor":0}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"cells":[{"cell_type":"code","source":["# Replace with your container and storage account: \"wasbs://<container>@<storage account>.blob.core.windows.net/\"\npathPrefix = \"wasbs://<container>@<storage account>.blob.core.windows.net/\"\n# Fetch porgat.txt from storage account\nfile = sc.textFile(pathPrefix + \"porgat.txt\")"],"metadata":{},"outputs":[],"execution_count":1},{"cell_type":"code","source":["# Remove the headers from the file: lines starting with a star\nnoHeaders = file.filter(lambda x: len(x)>0 and x[0]!='*')\n# Extract a pair from each line: the leading integer and a string for the rest of the line\npaired = noHeaders.map(lambda l: l.partition(' ')).filter(lambda t: len(t)==3 and len(t[0])>0 and len(t[2])>0).map(lambda t: (int(t[0]), t[2]))\n# Filter relationships as they do not start with quotes, then split the integer list\nscatteredRelationships = paired.filter(lambda (charId, text): text[0]!='\"').map(lambda (charId, text): (charId, [int(x) for x in text.split(' ')]))\n# Relationships for the same character id sometimes span more than one line in the file, so let's group them together\nrelationships = scatteredRelationships.reduceByKey(lambda pubList1, pubList2: pubList1 + pubList2)\n# Filter non-relationships as they start with quotes ; remove the quotes\nnonRelationships = paired.filter(lambda (index, text): text[0]=='\"').map(lambda (index, text): (index, text[1:-1].strip()))\n# Characters stop at a certain line (part of the initial header ; we hardcode it here)\ncharacters = nonRelationships.filter(lambda (charId, name): charId<=6486)\n# Publications start after the characters\npublications = nonRelationships.filter(lambda (charId, name): charId>6486)"],"metadata":{},"outputs":[],"execution_count":2},{"cell_type":"code","source":["from pyspark.sql import Row\n\n# Let's create dataframes out of the RDDs and register them as temporary views for SQL to use\n\n# Relationships has a list as a component, let's flatten that\nflatRelationships = relationships.flatMap(lambda (charId, 
pubList): [(charId, pubId) for pubId in pubList])\n# Let's map the relationships to an RDD of rows in order to create a data frame out of it\nrelationshipsDf = spark.createDataFrame(flatRelationships.map(lambda t: Row(charId=t[0], pubId=t[1])))\n# Register relationships as a temporary view\nrelationshipsDf.createOrReplaceTempView(\"relationships\")\n\n# Let's do the same for characters\ncharactersDf = spark.createDataFrame(characters.map(lambda t: Row(charId=t[0], name=t[1])))\ncharactersDf.createOrReplaceTempView(\"characters\")\n\n# and for publications\npublicationsDf = spark.createDataFrame(publications.map(lambda t: Row(pubId=t[0], name=t[1])))\npublicationsDf.createOrReplaceTempView(\"publications\")\n"],"metadata":{},"outputs":[],"execution_count":3},{"cell_type":"code","source":["%sql\n\nSELECT c1.name AS name1, c2.name AS name2, sub.charId1, sub.charId2, sub.pubCount\nFROM\n(\n SELECT r1.charId AS charId1, r2.charId AS charId2, COUNT(r1.pubId, r2.pubId) AS pubCount\n FROM relationships AS r1\n CROSS JOIN relationships AS r2\n WHERE r1.charId < r2.charId\n AND r1.pubId=r2.pubId\n GROUP BY r1.charId, r2.charId\n) AS sub\nINNER JOIN characters c1 ON c1.charId=sub.charId1\nINNER JOIN characters c2 ON c2.charId=sub.charId2\nORDER BY sub.pubCount DESC\nLIMIT 10"],"metadata":{},"outputs":[],"execution_count":4},{"cell_type":"code","source":["%sql\n\nSELECT c1.name AS name1, c2.name AS name2, c3.name AS name3, sub.charId1, sub.charId2, sub.charId3, sub.pubCount\nFROM\n(\n SELECT r1.charId AS charId1, r2.charId AS charId2, r3.charId AS charId3, COUNT(r1.pubId, r2.pubId, r3.pubId) AS pubCount\n FROM relationships AS r1\n CROSS JOIN relationships AS r2\n CROSS JOIN relationships AS r3\n WHERE r1.charId < r2.charId\n AND r2.charId < r3.charId\n AND r1.pubId=r2.pubId\n AND r2.pubId=r3.pubId\n GROUP BY r1.charId, r2.charId, r3.charId\n) AS sub\nINNER JOIN characters c1 ON c1.charId=sub.charId1\nINNER JOIN characters c2 ON c2.charId=sub.charId2\nINNER JOIN characters c3 
ON c3.charId=sub.charId3\nORDER BY sub.pubCount DESC\nLIMIT 10"],"metadata":{},"outputs":[],"execution_count":5}],"metadata":{"name":"SQL Blog","notebookId":3185064906819836},"nbformat":4,"nbformat_minor":0}