Skip to content

Commit

Permalink
Adding SQL Konferenz 2017 Keynote example scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
MikeRys committed May 3, 2018
1 parent cde2358 commit 9eb8703
Show file tree
Hide file tree
Showing 13 changed files with 390 additions and 0 deletions.
34 changes: 34 additions & 0 deletions Examples/SQLKonferenz2017-Keynote/SQLKonferenz2017-Keynote.sln
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2013
VisualStudioVersion = 12.0.31101.0
MinimumVisualStudioVersion = 10.0.40219.1
Project("{182E2583-ECAD-465B-BB50-91101D7C24CE}") = "SQLKonferenz2017-Keynote", "SQLKonferenz2017-Keynote\SQLKonferenz2017-Keynote.usqlproj", "{FD4830ED-213B-4C23-BE5D-4B9D3957CCC2}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Debug|x64 = Debug|x64
Debug|x86 = Debug|x86
Release|Any CPU = Release|Any CPU
Release|x64 = Release|x64
Release|x86 = Release|x86
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{FD4830ED-213B-4C23-BE5D-4B9D3957CCC2}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{FD4830ED-213B-4C23-BE5D-4B9D3957CCC2}.Debug|Any CPU.Build.0 = Debug|Any CPU
{FD4830ED-213B-4C23-BE5D-4B9D3957CCC2}.Debug|x64.ActiveCfg = Debug|x64
{FD4830ED-213B-4C23-BE5D-4B9D3957CCC2}.Debug|x64.Build.0 = Debug|x64
{FD4830ED-213B-4C23-BE5D-4B9D3957CCC2}.Debug|x86.ActiveCfg = Debug|x86
{FD4830ED-213B-4C23-BE5D-4B9D3957CCC2}.Debug|x86.Build.0 = Debug|x86
{FD4830ED-213B-4C23-BE5D-4B9D3957CCC2}.Release|Any CPU.ActiveCfg = Release|Any CPU
{FD4830ED-213B-4C23-BE5D-4B9D3957CCC2}.Release|Any CPU.Build.0 = Release|Any CPU
{FD4830ED-213B-4C23-BE5D-4B9D3957CCC2}.Release|x64.ActiveCfg = Release|x64
{FD4830ED-213B-4C23-BE5D-4B9D3957CCC2}.Release|x64.Build.0 = Release|x64
{FD4830ED-213B-4C23-BE5D-4B9D3957CCC2}.Release|x86.ActiveCfg = Release|x86
{FD4830ED-213B-4C23-BE5D-4B9D3957CCC2}.Release|x86.Build.0 = Release|x86
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
// Extract tweet records (date, time, author, tweet text) from a single
// sample CSV file and write them back out unchanged as CSV.
// All columns are extracted as string; no schema conversion is applied.
@data = EXTRACT date string,
time string,
author string,
tweet string
FROM "/Samples/Data/Tweets/MikeDoesBigDataTweets.csv"
USING Extractors.Csv();

// Round-trip the rowset to the output location.
OUTPUT @data TO "/Output/sqlkonf.csv" USING Outputters.Csv();
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
using Microsoft.Analytics.Interfaces;
using Microsoft.Analytics.Types;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
// Extract tweets from ALL files matching the file-set pattern
// "{origin}Tweets.csv". The {origin} token is a virtual column: it is
// populated from the matched part of each file name, so every row also
// records which file it came from.
@data = EXTRACT date string,
time string,
author string,
tweet string,
origin string
FROM "/Samples/Data/Tweets/{origin}Tweets.csv"
USING Extractors.Csv();

// Write the combined rowset (including the file-origin column) as CSV.
OUTPUT @data TO "/Output/sqlkonf.csv" USING Outputters.Csv();
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// (Re)create the managed U-SQL table TweetData and populate it from the
// tweet file set. The origin column aggregates all source-file origins for
// a given (date, time, author, tweet) into a U-SQL array.
DROP TABLE IF EXISTS TweetData;
CREATE TABLE TweetData (
date string,
time string,
author string,
tweet string,
// One tweet can appear in several source files; keep all origins.
origin SqlArray<string>,
// Clustered index and hash distribution on author for author-centric queries.
INDEX idx CLUSTERED (author)
DISTRIBUTED BY HASH(author) INTO 2
);

// Extract from all files matching the pattern; {origin} is a virtual
// column filled from the matched file-name fragment.
@data = EXTRACT date string,
time string,
author string,
tweet string,
origin string
FROM "/Samples/Data/Tweets/{origin}Tweets.csv"
USING Extractors.Csv();

INSERT INTO TweetData
// Collapse duplicate tweets across files, collecting their origins
// (lower-cased) into a single array per tweet.
SELECT date, time, author, tweet, ARRAY_AGG(origin.ToLowerInvariant()) AS origin
FROM @data

// NOTE(review): only tweets whose text ends with "stop" are loaded —
// presumably a demo/sentinel filter; confirm this is intended.
WHERE tweet.EndsWith("stop")
GROUP BY date, time, author, tweet;
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@

// Count, per twitter handle, how often it appears as a tweet author versus
// how often it is mentioned inside other tweets.
// Requires the TweetData table and the TweetAnalysis assembly (get_mentions UDF).

// Extract the array of @-mentions from every tweet via the C# UDF.
@m = SELECT TweetAnalysis.Udfs.get_mentions(tweet) AS mentions
FROM TweetData;

// Flatten the mention arrays to one row per mention, strip the leading '@',
// and tag each row with the "mention" category. Bare "@" tokens are dropped.
@m = SELECT m.Substring(1) AS m
, "mention" AS category
FROM @m CROSS APPLY EXPLODE(mentions) AS t(m)
WHERE m != "@";

// Union the authors (tagged "author") with the flattened mentions.
@t =
SELECT author, "author" AS category
FROM TweetData
UNION ALL
SELECT *
FROM @m;

// Count tweets per (handle, category), normalizing handles to lower case.
@res = SELECT author.ToLowerInvariant() AS author
, category
, COUNT( * ) AS tweetcount
FROM @t
GROUP BY author.ToLowerInvariant(), category;

// Persist the counts, most active handles first.
OUTPUT @res
TO "/Output/TweetAnalysis/MyTwitterAnalysis.csv"
ORDER BY tweetcount DESC
USING Outputters.Csv();
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
using Microsoft.Analytics.Interfaces;
using Microsoft.Analytics.Types.Sql;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Linq;

// TweetAnalysis Code Behind
// Show the use of a U-SQL user-defined function (UDF)
//
namespace TweetAnalysis
{
    // TweetAnalysis code-behind: U-SQL user-defined functions (UDFs).
    public class Udfs
    {
        // SqlArray<string> get_mentions(string tweet)
        //
        // Returns a U-SQL array of strings containing the twitter handles
        // (tokens starting with '@') that were mentioned inside the tweet.
        // Tokens are produced by splitting on common whitespace/punctuation.
        //
        // A null tweet (possible when the CSV extractor yields a missing
        // field) is treated as empty instead of throwing a
        // NullReferenceException from Split.
        public static SqlArray<string> get_mentions(string tweet)
        {
            return new SqlArray<string>(
                (tweet ?? string.Empty)
                    .Split(new char[] { ' ', ',', '.', ':', '!', ';', '"', '“' })
                    .Where(x => x.StartsWith("@"))
            );
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
// Requires 4-USQL-ObjectModel.usql to be executed to create the TVF

// Analyzes cooked TweetAuthorsAndMentions
//
// Does some analytics using windowing expressions.
//
// 1. Shows use of windowing expressions
// 2. Constant table with VALUES
// 3. ARRAY Contains
// 4. Joins

// For each author and category with tweetcount > 50, get their tweetcount, the median tweetcount in the category, their percentile and absolute rank in rank order
@res =
SELECT DISTINCT
author, category, tweetcount
// Median tweet count per handle within each category.
, PERCENTILE_DISC(0.5) WITHIN GROUP (ORDER BY tweetcount ASC)
OVER (PARTITION BY category) AS median_tweetcount_perhandle_category
// Relative position (0..1) of this handle within its category.
, PERCENT_RANK() OVER
(PARTITION BY category ORDER BY tweetcount ASC) AS relative_rank
// Absolute rank: 1 = most tweets in the category.
, ROW_NUMBER() OVER
(PARTITION BY category ORDER BY tweetcount DESC) AS absolute_rank
FROM TweetAndMentionsTVF(DEFAULT) AS t
WHERE tweetcount >= 50;

OUTPUT @res
TO "/Output/TweetAnalysis/MyTwitterAnalysis6.csv"
ORDER BY absolute_rank, category ASC
USING Outputters.Csv();

// For each author who provided their tweet feed (to be fair), provide their influence measured as mentioned/authored
// Account for changes to tweet handles with a constant lookup table
// (could also be done as a standard table, or as a file that gets deployed as resource and looked up with a UDF)

// Constant (inline VALUES) lookup table mapping old handles to new ones.
@tweet_handle_mapping = SELECT * FROM (VALUES ("sqlservermike","mikedoesbigdata")) AS T(old_handle, new_handle);

// Normalize handles: use the new handle where a mapping exists, otherwise
// keep the original author (?? is the C# null-coalescing operator).
@t =
SELECT n.new_handle ?? t.author AS author,
category,
tweetcount,
file_origin
FROM TweetAndMentionsTVF(DEFAULT) AS t
LEFT OUTER JOIN
@tweet_handle_mapping AS n
ON t.author == n.old_handle;

// Keep only handles whose own tweet feed was provided (their handle appears
// in one of the source-file origins), then re-aggregate after renaming.
@t =
SELECT author,
category,
SUM(tweetcount) AS tweetcount
FROM @t
WHERE file_origin != null && file_origin.Contains(author)
GROUP BY author,
category;

// Self-join authored counts against mentioned counts to compute influence
// = mentions / authored tweets per handle.
@res =
SELECT m.author,
a.tweetcount AS authored_count,
m.tweetcount AS mentioned_count,
(double) m.tweetcount / (double) a.tweetcount AS influence
FROM @t AS a
JOIN
@t AS m
ON a.author == m.author
WHERE a.category == "author" AND m.category == "mention";

OUTPUT @res
TO "/Output/TweetAnalysis/influencer.csv"
ORDER BY influence DESC
USING Outputters.Csv();
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// Demonstrates OUTER UNION BY NAME: rows are matched by column NAME (not
// position); columns listed in ON must exist on both sides, all other
// columns are kept and null-padded where the other side has no value.

// Left side: columns K, A, C (C is a nullable int).
@left =
SELECT *
FROM (VALUES ( 1, "x", (int?) 50 ),
( 1, "y", (int?) 60 )
) AS L(K, A, C);

// Right side: columns B, A, K — note the different column order and the
// extra column B instead of C.
@right =
SELECT *
FROM (VALUES ( 5, "x", 1 ),
( 6, "x", 2 ),
(10, "y", 3 )
) AS R(B, A, K);

// Union on the shared columns A and K; C and B survive null-padded.
@res =
SELECT * FROM @left
OUTER UNION BY NAME ON (A, K)
SELECT * FROM @right;

OUTPUT @res TO "/output/docsamples/outerunion.csv" USING Outputters.Csv();
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
// Table-valued function returning per-handle author/mention counts from the
// TweetData table, optionally filtered to a single file origin.
// @origin = null (the DEFAULT) means: include all origins.
DROP FUNCTION IF EXISTS TweetAndMentionsTVF;

CREATE FUNCTION TweetAndMentionsTVF(@origin string = null)
RETURNS @res
AS
BEGIN

// In order to see the user-code inside the TVF, you have to reference the assembly inside the TVF body.
REFERENCE ASSEMBLY TweetAnalysis; // Generated from TweetAnalysis Example

// Extract mentions
// get_mentions returns the array of @-handles found in the tweet text;
// mentioned_by records who wrote the mentioning tweet.
@m =
SELECT origin,
TweetAnalysis.Udfs.get_mentions(tweet) AS mentions,
author AS mentioned_by
FROM TweetData
WHERE String.IsNullOrEmpty(@origin) || origin.Contains(@origin);

// Combine authors and mentions
// Authors get an empty mentioned_by so both branches have the same schema.
@t =
SELECT origin,
author,
"author" AS category,
"" AS mentioned_by
FROM TweetData
UNION ALL
// Flatten the mention arrays, strip the leading '@', drop bare "@" tokens.
SELECT origin,
m.Substring(1) AS m,
"mention" AS category,
mentioned_by
FROM @m
CROSS APPLY
EXPLODE(mentions) AS t(m)
WHERE m != "@";

// Count authors and mentions
// file_origin flattens the per-row origin arrays into one distinct list;
// mentioned_by collects the distinct set of mentioning authors.
@res =
SELECT author.ToLowerInvariant() AS author,
category,
COUNT( * ) AS tweetcount,
new SQL.ARRAY<string>(ARRAY_AGG(origin).SelectMany(x => x).Distinct()) AS file_origin,
ARRAY_AGG(DISTINCT mentioned_by) AS mentioned_by
FROM @t
GROUP BY author.ToLowerInvariant(),
category;
END;
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// Merge overlapping time ranges per user with a custom C# reducer
// (ReduceSample.RangeReducer in the code-behind).
// Input format: begin-end-user, '-' delimited text.
@in = EXTRACT begin DateTime, end DateTime, user string
FROM "/Samples/Blogs/MRys/Ranges/ranges.txt"
USING Extractors.Text(delimiter:'-');

// PRESORT begin: the reducer requires rows ordered by interval start.
// READONLY user: the reduce key passes through unchanged, which lets the
// optimizer push predicates on user through the reducer.
@r = REDUCE @in PRESORT begin
ON user
PRODUCE begin DateTime, end DateTime, user string
READONLY user
USING new ReduceSample.RangeReducer();

// Remove comment to show filter push through the reducer
// @r = SELECT * FROM @r WHERE user == "ABC";

OUTPUT @r
TO "/temp/result.csv"
USING Outputters.Csv();
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
using Microsoft.Analytics.Interfaces;
using Microsoft.Analytics.Types.Sql;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace ReduceSample
{
    // Merges overlapping [begin, end] time ranges into maximal
    // non-overlapping intervals, one group (user) at a time.
    //
    // Requires the rowset to be PRESORTED on begin and the reduce key to be
    // READONLY, as done by the accompanying U-SQL script.
    //
    // NOTE(review): IsRecursive = true claims the reducer can be applied to
    // partial input and re-combined; the original author noted this was
    // untested with large data sets — verify before relying on it.
    [SqlUserDefinedReducer(IsRecursive = true)]
    public class RangeReducer : IReducer
    {
        public override IEnumerable<IRow> Reduce(IRowset input, IUpdatableRow output)
        {
            // Current open interval; only meaningful once the first row is seen.
            bool first_row_processed = false;
            var begin = DateTime.MaxValue; // dummy value until first row
            var end = DateTime.MinValue;   // dummy value until first row

            foreach (var row in input.Rows)
            {
                var b = row.Get<DateTime>("begin");
                var e = row.Get<DateTime>("end");
                // If end is just a time (no date) it can sort earlier than
                // begin, meaning the range wraps past midnight: move end to
                // the next day.
                if (e < b) { e = e.AddDays(1); }

                if (!first_row_processed)
                {
                    // Initialize the open interval from the first row.
                    first_row_processed = true;
                    begin = b;
                    end = e;
                }
                else if (b <= end)
                {
                    // Row overlaps the open interval; extend it if it
                    // reaches further.
                    if (e > end) { end = e; }
                }
                else
                {
                    // Gap found: emit the completed interval and start a
                    // new one.
                    output.Set<DateTime>("begin", begin);
                    output.Set<DateTime>("end", end);
                    yield return output.AsReadOnly();
                    begin = b;
                    end = e;
                }
            } // foreach

            // Emit the last open interval — but only if any input arrived.
            // (The original unconditionally emitted a bogus row with the
            // MaxValue/MinValue dummies when the rowset was empty.)
            if (first_row_processed)
            {
                output.Set<DateTime>("begin", begin);
                output.Set<DateTime>("end", end);
                yield return output.AsReadOnly();
            }
        } // Reduce

    } // RangeReducer
} // ReduceSample
Loading

0 comments on commit 9eb8703

Please sign in to comment.