python -m apache_beam.examples.wordcount --input=data.txt --output=/t…

…mp/out.beam --runner=SparkRunner --spark_version=3
GPT-RL · Jul 28, 2022 · 61c4317 · 61c4317
1 parent 4088792
commit 61c4317
Show file tree

Hide file tree

Showing 4 changed files with 156 additions and 10 deletions.
diff --git a/data.txt b/data.txt
@@ -0,0 +1,5 @@
+a
+
+b
+b
+c
diff --git a/flake.nix b/flake.nix
@@ -10,20 +10,27 @@
     utils,
   }: let
     out = system: let
-      pkgs = import nixpkgs {inherit system;};
-      #inherit (pkgs) poetry2nix;
-      #pythonEnv = pkgs.poetry2nix.mkPoetryEnv {
-      #projectDir = ./.;
-      #};
+      pkgs = import nixpkgs {
+        inherit system;
+        #overlays = [
+        #(final: prev: {
+        #apache-beam = prev.apache-beam.override {
+        #buildInputs = [prev.spark];
+        #};
+        #})
+        #];
+      };
+      inherit (pkgs) poetry2nix;
+      pythonEnv = pkgs.poetry2nix.mkPoetryEnv {
+        projectDir = ./.;
+      };
     in {
       devShell = pkgs.mkShell {
-        buildInputs = [
-          #pkgs.python310Packages.pyspark
-          #pkgs.python39Packages.apache-beam
+        buildInputs = with pkgs; [
+          python39Packages.apache-beam
           pkgs.spark
-          #pkgs.python
+          pkgs.jdk11
         ];
-        #shellHook = ''export SPARK_HOME=${pkgs.spark}'';
       };
     };
   in

diff --git a/main.py b/main.py
@@ -0,0 +1,29 @@
+import apache_beam as beam
+from apache_beam.options.pipeline_options import PipelineOptions
+from wordcount import run
+
+options = PipelineOptions([
+    "--runner=PortableRunner",
+    "--job_endpoint=localhost:8099",
+    "--environment_type=LOOPBACK"
+])
+with beam.Pipeline(options) as p:
+    pass
+    # # Read the text file[pattern] into a PCollection.
+    # lines = p | 'Read' >> ReadFromText(known_args.input)
+
+    # counts = (
+        # lines
+        # | 'Split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
+        # | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
+        # | 'GroupAndSum' >> beam.CombinePerKey(sum))
+
+    # # Format the counts into a PCollection of strings.
+    # def format_result(word, count):
+      # return '%s: %d' % (word, count)
+
+    # output = counts | 'Format' >> beam.MapTuple(format_result)
+
+    # # Write the output using a "Write" transform that has side effects.
+    # # pylint: disable=expression-not-assigned
+    # output | 'Write' >> WriteToText(known_args.output)
diff --git a/wordcount.py b/wordcount.py
@@ -0,0 +1,105 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""A word-counting workflow."""
+
+# pytype: skip-file
+
+# beam-playground:
+#   name: WordCount
+#   description: An example that counts words in Shakespeare's works.
+#   multifile: false
+#   pipeline_options: --output output.txt
+#   context_line: 44
+#   categories:
+#     - Combiners
+#     - Options
+#     - Quickstart
+
+import argparse
+import logging
+import re
+
+import apache_beam as beam
+from apache_beam.io import ReadFromText
+from apache_beam.io import WriteToText
+from apache_beam.options.pipeline_options import PipelineOptions
+from apache_beam.options.pipeline_options import SetupOptions
+
+
+class WordExtractingDoFn(beam.DoFn):
+  """Parse each line of input text into words."""
+  def process(self, element):
+    """Returns an iterator over the words of this element.
+
+    The element is a line of text.  If the line is blank, note that, too.
+
+    Args:
+      element: the element being processed
+
+    Returns:
+      The processed element.
+    """
+    return re.findall(r'[\w\']+', element, re.UNICODE)
+
+
+def run(argv=None, save_main_session=True):
+  """Main entry point; defines and runs the wordcount pipeline."""
+  parser = argparse.ArgumentParser()
+  parser.add_argument(
+      '--input',
+      dest='input',
+      default='gs://dataflow-samples/shakespeare/kinglear.txt',
+      help='Input file to process.')
+  parser.add_argument(
+      '--output',
+      dest='output',
+      required=True,
+      help='Output file to write results to.')
+  known_args, pipeline_args = parser.parse_known_args(argv)
+
+  # We use the save_main_session option because one or more DoFn's in this
+  # workflow rely on global context (e.g., a module imported at module level).
+  pipeline_options = PipelineOptions(pipeline_args)
+  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
+
+  # The pipeline will be run on exiting the with block.
+  with beam.Pipeline(options=pipeline_options) as p:
+
+    # Read the text file[pattern] into a PCollection.
+    lines = p | 'Read' >> ReadFromText(known_args.input)
+
+    counts = (
+        lines
+        | 'Split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
+        | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
+        | 'GroupAndSum' >> beam.CombinePerKey(sum))
+
+    # Format the counts into a PCollection of strings.
+    def format_result(word, count):
+      return '%s: %d' % (word, count)
+
+    output = counts | 'Format' >> beam.MapTuple(format_result)
+
+    # Write the output using a "Write" transform that has side effects.
+    # pylint: disable=expression-not-assigned
+    output | 'Write' >> WriteToText(known_args.output)
+
+
+if __name__ == '__main__':
+  logging.getLogger().setLevel(logging.INFO)
+  run()
-Original file line number
+Diff line change
@@ -0,0 +1,5 @@
+    a
+    b
+    b
+    c