forked from pentaho/pentaho-kettle
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwordcount-mapper.ktr
351 lines (345 loc) · 15.9 KB
/
wordcount-mapper.ktr
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
<?xml version="1.0" encoding="UTF-8"?>
<transformation>
<info>
<name>wordcount-mapper</name>
<description/>
<extended_description/>
<trans_version/>
<trans_type>Normal</trans_type>
<trans_status>0</trans_status>
<directory>/</directory>
<parameters>
</parameters>
<log>
<trans-log-table><connection/>
<schema/>
<table/>
<size_limit_lines/>
<interval/>
<timeout_days/>
<field><id>ID_BATCH</id><enabled>Y</enabled><name>ID_BATCH</name></field><field><id>CHANNEL_ID</id><enabled>Y</enabled><name>CHANNEL_ID</name></field><field><id>TRANSNAME</id><enabled>Y</enabled><name>TRANSNAME</name></field><field><id>STATUS</id><enabled>Y</enabled><name>STATUS</name></field><field><id>LINES_READ</id><enabled>Y</enabled><name>LINES_READ</name><subject/></field><field><id>LINES_WRITTEN</id><enabled>Y</enabled><name>LINES_WRITTEN</name><subject/></field><field><id>LINES_UPDATED</id><enabled>Y</enabled><name>LINES_UPDATED</name><subject/></field><field><id>LINES_INPUT</id><enabled>Y</enabled><name>LINES_INPUT</name><subject/></field><field><id>LINES_OUTPUT</id><enabled>Y</enabled><name>LINES_OUTPUT</name><subject/></field><field><id>LINES_REJECTED</id><enabled>Y</enabled><name>LINES_REJECTED</name><subject/></field><field><id>ERRORS</id><enabled>Y</enabled><name>ERRORS</name></field><field><id>STARTDATE</id><enabled>Y</enabled><name>STARTDATE</name></field><field><id>ENDDATE</id><enabled>Y</enabled><name>ENDDATE</name></field><field><id>LOGDATE</id><enabled>Y</enabled><name>LOGDATE</name></field><field><id>DEPDATE</id><enabled>Y</enabled><name>DEPDATE</name></field><field><id>REPLAYDATE</id><enabled>Y</enabled><name>REPLAYDATE</name></field><field><id>LOG_FIELD</id><enabled>Y</enabled><name>LOG_FIELD</name></field></trans-log-table>
<perf-log-table><connection/>
<schema/>
<table/>
<interval/>
<timeout_days/>
<field><id>ID_BATCH</id><enabled>Y</enabled><name>ID_BATCH</name></field><field><id>SEQ_NR</id><enabled>Y</enabled><name>SEQ_NR</name></field><field><id>LOGDATE</id><enabled>Y</enabled><name>LOGDATE</name></field><field><id>TRANSNAME</id><enabled>Y</enabled><name>TRANSNAME</name></field><field><id>STEPNAME</id><enabled>Y</enabled><name>STEPNAME</name></field><field><id>STEP_COPY</id><enabled>Y</enabled><name>STEP_COPY</name></field><field><id>LINES_READ</id><enabled>Y</enabled><name>LINES_READ</name></field><field><id>LINES_WRITTEN</id><enabled>Y</enabled><name>LINES_WRITTEN</name></field><field><id>LINES_UPDATED</id><enabled>Y</enabled><name>LINES_UPDATED</name></field><field><id>LINES_INPUT</id><enabled>Y</enabled><name>LINES_INPUT</name></field><field><id>LINES_OUTPUT</id><enabled>Y</enabled><name>LINES_OUTPUT</name></field><field><id>LINES_REJECTED</id><enabled>Y</enabled><name>LINES_REJECTED</name></field><field><id>ERRORS</id><enabled>Y</enabled><name>ERRORS</name></field><field><id>INPUT_BUFFER_ROWS</id><enabled>Y</enabled><name>INPUT_BUFFER_ROWS</name></field><field><id>OUTPUT_BUFFER_ROWS</id><enabled>Y</enabled><name>OUTPUT_BUFFER_ROWS</name></field></perf-log-table>
<channel-log-table><connection/>
<schema/>
<table/>
<timeout_days/>
<field><id>ID_BATCH</id><enabled>Y</enabled><name>ID_BATCH</name></field><field><id>CHANNEL_ID</id><enabled>Y</enabled><name>CHANNEL_ID</name></field><field><id>LOG_DATE</id><enabled>Y</enabled><name>LOG_DATE</name></field><field><id>LOGGING_OBJECT_TYPE</id><enabled>Y</enabled><name>LOGGING_OBJECT_TYPE</name></field><field><id>OBJECT_NAME</id><enabled>Y</enabled><name>OBJECT_NAME</name></field><field><id>OBJECT_COPY</id><enabled>Y</enabled><name>OBJECT_COPY</name></field><field><id>REPOSITORY_DIRECTORY</id><enabled>Y</enabled><name>REPOSITORY_DIRECTORY</name></field><field><id>FILENAME</id><enabled>Y</enabled><name>FILENAME</name></field><field><id>OBJECT_ID</id><enabled>Y</enabled><name>OBJECT_ID</name></field><field><id>OBJECT_REVISION</id><enabled>Y</enabled><name>OBJECT_REVISION</name></field><field><id>PARENT_CHANNEL_ID</id><enabled>Y</enabled><name>PARENT_CHANNEL_ID</name></field><field><id>ROOT_CHANNEL_ID</id><enabled>Y</enabled><name>ROOT_CHANNEL_ID</name></field></channel-log-table>
<step-log-table><connection/>
<schema/>
<table/>
<timeout_days/>
<field><id>ID_BATCH</id><enabled>Y</enabled><name>ID_BATCH</name></field><field><id>CHANNEL_ID</id><enabled>Y</enabled><name>CHANNEL_ID</name></field><field><id>LOG_DATE</id><enabled>Y</enabled><name>LOG_DATE</name></field><field><id>TRANSNAME</id><enabled>Y</enabled><name>TRANSNAME</name></field><field><id>STEPNAME</id><enabled>Y</enabled><name>STEPNAME</name></field><field><id>STEP_COPY</id><enabled>Y</enabled><name>STEP_COPY</name></field><field><id>LINES_READ</id><enabled>Y</enabled><name>LINES_READ</name></field><field><id>LINES_WRITTEN</id><enabled>Y</enabled><name>LINES_WRITTEN</name></field><field><id>LINES_UPDATED</id><enabled>Y</enabled><name>LINES_UPDATED</name></field><field><id>LINES_INPUT</id><enabled>Y</enabled><name>LINES_INPUT</name></field><field><id>LINES_OUTPUT</id><enabled>Y</enabled><name>LINES_OUTPUT</name></field><field><id>LINES_REJECTED</id><enabled>Y</enabled><name>LINES_REJECTED</name></field><field><id>ERRORS</id><enabled>Y</enabled><name>ERRORS</name></field><field><id>LOG_FIELD</id><enabled>N</enabled><name>LOG_FIELD</name></field></step-log-table>
</log>
<maxdate>
<connection/>
<table/>
<field/>
<offset>0.0</offset>
<maxdiff>0.0</maxdiff>
</maxdate>
<size_rowset>10000</size_rowset>
<sleep_time_empty>50</sleep_time_empty>
<sleep_time_full>50</sleep_time_full>
<unique_connections>N</unique_connections>
<feedback_shown>Y</feedback_shown>
<feedback_size>50000</feedback_size>
<using_thread_priorities>Y</using_thread_priorities>
<shared_objects_file/>
<capture_step_performance>N</capture_step_performance>
<step_performance_capturing_delay>1000</step_performance_capturing_delay>
<step_performance_capturing_size_limit>100</step_performance_capturing_size_limit>
<dependencies>
</dependencies>
<partitionschemas>
</partitionschemas>
<slaveservers>
</slaveservers>
<clusterschemas>
</clusterschemas>
<created_user/>
<created_date>2010/12/30 23:29:15.795</created_date>
<modified_user>-</modified_user>
<modified_date>2010/07/15 10:12:26.133</modified_date>
</info>
<notepads>
</notepads>
<connection>
<name>CMS</name>
<server>${CMS_HOSTNAME}</server>
<type>MYSQL</type>
<access>Native</access>
<database>${CMS_DATABASE}</database>
<port>${CMS_PORT}</port>
<username>${CMS_USERNAME}</username>
<password>${CMS_PASSWORD}</password>
<servername/>
<data_tablespace/>
<index_tablespace/>
<attributes>
<attribute><code>FORCE_IDENTIFIERS_TO_LOWERCASE</code><attribute>N</attribute></attribute>
<attribute><code>FORCE_IDENTIFIERS_TO_UPPERCASE</code><attribute>N</attribute></attribute>
<attribute><code>IS_CLUSTERED</code><attribute>N</attribute></attribute>
<attribute><code>PORT_NUMBER</code><attribute>${CMS_PORT}</attribute></attribute>
<attribute><code>QUOTE_ALL_FIELDS</code><attribute>N</attribute></attribute>
<attribute><code>STREAM_RESULTS</code><attribute>Y</attribute></attribute>
<attribute><code>USE_POOLING</code><attribute>N</attribute></attribute>
</attributes>
</connection>
<connection>
<name>DWH</name>
<server>${DWH_HOSTNAME}</server>
<type>MYSQL</type>
<access>Native</access>
<database>${DWH_DATABASE}</database>
<port>${DWH_PORT}</port>
<username>${DWH_USERNAME}</username>
<password>${DWH_PASSWORD}</password>
<servername/>
<data_tablespace/>
<index_tablespace/>
<attributes>
<attribute><code>FORCE_IDENTIFIERS_TO_LOWERCASE</code><attribute>N</attribute></attribute>
<attribute><code>FORCE_IDENTIFIERS_TO_UPPERCASE</code><attribute>N</attribute></attribute>
<attribute><code>IS_CLUSTERED</code><attribute>N</attribute></attribute>
<attribute><code>PORT_NUMBER</code><attribute>${DWH_PORT}</attribute></attribute>
<attribute><code>QUOTE_ALL_FIELDS</code><attribute>N</attribute></attribute>
<attribute><code>STREAM_RESULTS</code><attribute>Y</attribute></attribute>
<attribute><code>USE_POOLING</code><attribute>N</attribute></attribute>
</attributes>
</connection>
<connection>
<name>LOGGING</name>
<server>${LOGGING_HOSTNAME}</server>
<type>MYSQL</type>
<access>Native</access>
<database>${LOGGING_DATABASE}</database>
<port>${LOGGING_PORT}</port>
<username>${LOGGING_USERNAME}</username>
<password>${LOGGING_PASSWORD}</password>
<servername/>
<data_tablespace/>
<index_tablespace/>
<attributes>
<attribute><code>FORCE_IDENTIFIERS_TO_LOWERCASE</code><attribute>N</attribute></attribute>
<attribute><code>FORCE_IDENTIFIERS_TO_UPPERCASE</code><attribute>N</attribute></attribute>
<attribute><code>IS_CLUSTERED</code><attribute>N</attribute></attribute>
<attribute><code>PORT_NUMBER</code><attribute>${LOGGING_PORT}</attribute></attribute>
<attribute><code>QUOTE_ALL_FIELDS</code><attribute>N</attribute></attribute>
<attribute><code>STREAM_RESULTS</code><attribute>Y</attribute></attribute>
<attribute><code>USE_POOLING</code><attribute>N</attribute></attribute>
</attributes>
</connection>
<connection>
<name>Postgres</name>
<server>localhost</server>
<type>POSTGRESQL</type>
<access>Native</access>
<database>test</database>
<port>5432</port>
<username>postgres</username>
<password>Encrypted 2be98afc86aa7f2e4bb16bd64d980aac9</password>
<servername/>
<data_tablespace/>
<index_tablespace/>
<attributes>
<attribute><code>FORCE_IDENTIFIERS_TO_LOWERCASE</code><attribute>N</attribute></attribute>
<attribute><code>FORCE_IDENTIFIERS_TO_UPPERCASE</code><attribute>N</attribute></attribute>
<attribute><code>IS_CLUSTERED</code><attribute>N</attribute></attribute>
<attribute><code>PORT_NUMBER</code><attribute>5432</attribute></attribute>
<attribute><code>QUOTE_ALL_FIELDS</code><attribute>N</attribute></attribute>
<attribute><code>SUPPORTS_BOOLEAN_DATA_TYPE</code><attribute>N</attribute></attribute>
<attribute><code>USE_POOLING</code><attribute>N</attribute></attribute>
</attributes>
</connection>
<connection>
<name>STG</name>
<server>${STG_HOSTNAME}</server>
<type>MYSQL</type>
<access>Native</access>
<database>${STG_DATABASE}</database>
<port>${STG_PORT}</port>
<username>${STG_USERNAME}</username>
<password>${STG_PASSWORD}</password>
<servername/>
<data_tablespace/>
<index_tablespace/>
<attributes>
<attribute><code>FORCE_IDENTIFIERS_TO_LOWERCASE</code><attribute>N</attribute></attribute>
<attribute><code>FORCE_IDENTIFIERS_TO_UPPERCASE</code><attribute>N</attribute></attribute>
<attribute><code>IS_CLUSTERED</code><attribute>N</attribute></attribute>
<attribute><code>PORT_NUMBER</code><attribute>${STG_PORT}</attribute></attribute>
<attribute><code>QUOTE_ALL_FIELDS</code><attribute>N</attribute></attribute>
<attribute><code>STREAM_RESULTS</code><attribute>Y</attribute></attribute>
<attribute><code>USE_POOLING</code><attribute>N</attribute></attribute>
</attributes>
</connection>
<order>
<hop> <from>Dev Generator</from><to>Split words to rows</to><enabled>N</enabled> </hop> <hop> <from>Split words to rows</from><to>Add value</to><enabled>Y</enabled> </hop> <hop> <from>Add value</from><to>Remove garbage</to><enabled>Y</enabled> </hop> <hop> <from>Remove garbage</from><to>Output</to><enabled>Y</enabled> </hop> <hop> <from>Injector</from><to>Split words to rows</to><enabled>Y</enabled> </hop> </order>
<step>
<name>Add value</name>
<type>Constant</type>
<description/>
<distribute>Y</distribute>
<copies>1</copies>
<partitioning>
<method>none</method>
<schema_name/>
</partitioning>
<fields>
<field>
<name>outValue</name>
<type>Integer</type>
<format/>
<currency/>
<decimal/>
<group/>
<nullif>1</nullif>
<length>-1</length>
<precision>-1</precision>
</field>
</fields>
<cluster_schema/>
<remotesteps> <input> </input> <output> </output> </remotesteps> <GUI>
<xloc>510</xloc>
<yloc>222</yloc>
<draw>Y</draw>
</GUI>
</step>
<step>
<name>Dev Generator</name>
<type>RowGenerator</type>
<description/>
<distribute>Y</distribute>
<copies>1</copies>
<partitioning>
<method>none</method>
<schema_name/>
</partitioning>
<fields>
<field>
<name>key</name>
<type>String</type>
<format/>
<currency/>
<decimal/>
<group/>
<nullif>14</nullif>
<length>-1</length>
<precision>-1</precision>
</field>
<field>
<name>value</name>
<type>String</type>
<format/>
<currency/>
<decimal/>
<group/>
<nullif>Hello world good bye world</nullif>
<length>-1</length>
<precision>-1</precision>
</field>
</fields>
<limit>1</limit>
<cluster_schema/>
<remotesteps> <input> </input> <output> </output> </remotesteps> <GUI>
<xloc>353</xloc>
<yloc>59</yloc>
<draw>Y</draw>
</GUI>
</step>
<step>
<name>Injector</name>
<type>Injector</type>
<description/>
<distribute>Y</distribute>
<copies>1</copies>
<partitioning>
<method>none</method>
<schema_name/>
</partitioning>
<fields> <field> <name>key</name>
<type>String</type>
<length>-1</length>
<precision>-1</precision>
</field> <field> <name>value</name>
<type>String</type>
<length>-1</length>
<precision>-1</precision>
</field> </fields> <cluster_schema/>
<remotesteps> <input> </input> <output> </output> </remotesteps> <GUI>
<xloc>167</xloc>
<yloc>221</yloc>
<draw>Y</draw>
</GUI>
</step>
<step>
<name>Output</name>
<type>Dummy</type>
<description/>
<distribute>Y</distribute>
<copies>1</copies>
<partitioning>
<method>none</method>
<schema_name/>
</partitioning>
<cluster_schema/>
<remotesteps> <input> </input> <output> </output> </remotesteps> <GUI>
<xloc>778</xloc>
<yloc>221</yloc>
<draw>Y</draw>
</GUI>
</step>
<step>
<name>Remove garbage</name>
<type>SelectValues</type>
<description/>
<distribute>Y</distribute>
<copies>1</copies>
<partitioning>
<method>none</method>
<schema_name/>
</partitioning>
<fields> <select_unspecified>N</select_unspecified>
<remove> <name>key</name>
</remove> <remove> <name>value</name>
</remove> </fields> <cluster_schema/>
<remotesteps> <input> </input> <output> </output> </remotesteps> <GUI>
<xloc>650</xloc>
<yloc>222</yloc>
<draw>Y</draw>
</GUI>
</step>
<step>
<name>Split words to rows</name>
<type>SplitFieldToRows3</type>
<description/>
<distribute>Y</distribute>
<copies>1</copies>
<partitioning>
<method>none</method>
<schema_name/>
</partitioning>
<splitfield>value</splitfield>
<delimiter> </delimiter>
<newfield>outKey</newfield>
<rownum>N</rownum>
<rownum_field/>
<resetrownumber>Y</resetrownumber>
<cluster_schema/>
<remotesteps> <input> </input> <output> </output> </remotesteps> <GUI>
<xloc>352</xloc>
<yloc>221</yloc>
<draw>Y</draw>
</GUI>
</step>
<step_error_handling>
</step_error_handling>
<slave-step-copy-partition-distribution>
</slave-step-copy-partition-distribution>
<slave_transformation>N</slave_transformation>
</transformation>