forked from golang/pkgsite
-
Notifications
You must be signed in to change notification settings - Fork 0
/
000001_initial_schema_from_pg_dump.up.sql
658 lines (579 loc) · 30.4 KB
/
000001_initial_schema_from_pg_dump.up.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
-- Copyright 2019 The Go Authors. All rights reserved.
-- Use of this source code is governed by a BSD-style
-- license that can be found in the LICENSE file.
--
-- This schema migration was created by dumping the DB schema
-- as of commit bc820754c5d2bce5c3cdb66515656afb5885a440.
SET statement_timeout = 0;
SET lock_timeout = 0;
SET idle_in_transaction_session_timeout = 0;
SET client_encoding = 'UTF8';
SET standard_conforming_strings = on;
SET check_function_bodies = on;
SET xmloption = content;
SET client_min_messages = warning;
SET row_security = off;
CREATE FUNCTION trigger_modify_updated_at() RETURNS trigger
LANGUAGE plpgsql
AS $$
BEGIN
NEW.updated_at = NOW();
RETURN NEW;
END;
$$;
COMMENT ON FUNCTION trigger_modify_updated_at IS
'FUNCTION trigger_modify_updated_at sets the value of a column named updated_at to the current timestamp. This is used by the versions, packages, and search_documents tables as a trigger to set the value of updated_at.';
CREATE FUNCTION to_tsvector_parent_directories(package_path text, module_path text) RETURNS tsvector
LANGUAGE plpgsql PARALLEL SAFE
AS $$
DECLARE
current_directory TEXT;
parent_directories TEXT;
sub_path TEXT;
sub_directories TEXT[][];
BEGIN
IF package_path = module_path THEN
RETURN module_path::tsvector;
END IF;
IF module_path = 'std' THEN
sub_path := package_path;
ELSE
sub_path := substr(package_path, length(module_path) + 2);
current_directory := module_path;
parent_directories := module_path;
END IF;
sub_directories := regexp_split_to_array(sub_path, '/');
FOR i IN 1..cardinality(sub_directories) LOOP
IF current_directory IS NULL THEN
current_directory := sub_directories[i];
ELSE
current_directory := COALESCE(current_directory, '') || '/' || sub_directories[i];
END IF;
parent_directories = COALESCE(parent_directories, '') || ' ' || current_directory;
END LOOP;
RETURN parent_directories::tsvector;
END;
$$;
COMMENT ON FUNCTION to_tsvector_parent_directories IS
'FUNCTION to_tsvector_parent_directories computes all directories that exist between module_path and package_path, inclusive of both module_path and package_path. Return the result as a tsvector.';
CREATE TYPE version_type AS ENUM (
'release',
'prerelease',
'pseudo'
);
COMMENT ON TYPE version_type IS
'ENUM version_type specifies the version types expected for a given module version.';
CREATE TABLE modules (
module_path text NOT NULL,
version text NOT NULL,
commit_time timestamp with time zone NOT NULL,
series_path text NOT NULL,
version_type version_type NOT NULL,
readme_file_path text,
readme_contents text,
source_info jsonb,
created_at timestamp with time zone DEFAULT CURRENT_TIMESTAMP NOT NULL,
updated_at timestamp with time zone DEFAULT CURRENT_TIMESTAMP NOT NULL,
sort_version text NOT NULL,
redistributable boolean NOT NULL,
has_go_mod boolean,
PRIMARY KEY (module_path, version)
);
COMMENT ON TABLE modules IS
'TABLE modules contains modules at a specific semantic version.';
COMMENT ON COLUMN modules.sort_version IS
'COLUMN sort_version holds the version in a form suitable for use in ORDER BY.';
COMMENT ON COLUMN modules.redistributable IS
'COLUMN redistributable says whether the module is redistributable.';
COMMENT ON COLUMN modules.has_go_mod IS
'COLUMN has_go_mod records whether the module zip contains a go.mod file.';
CREATE INDEX idx_modules_sort_version ON modules (sort_version DESC, version_type DESC);
COMMENT ON INDEX idx_modules_sort_version IS
'INDEX idx_versions_semver_sort is used to sort versions in order of descending latest. It is used to get the latest version of a package/module and to fetch all versions of a package/module in semver order.';
CREATE INDEX idx_modules_module_path_text_pattern_ops ON modules
(module_path text_pattern_ops);
COMMENT ON INDEX idx_modules_module_path_text_pattern_ops IS
'INDEX idx_versions_module_path_text_pattern_ops is used to improve performance of LIKE statements for module_path. It is used to fetch directories matching a given module_path prefix.';
CREATE INDEX idx_modules_version_type ON modules (version_type);
COMMENT ON INDEX idx_modules_version_type IS
'INDEX idx_versions_version_type is used when fetching versions for a given version_type.';
CREATE TRIGGER set_updated_at BEFORE INSERT OR UPDATE ON modules
FOR EACH ROW EXECUTE PROCEDURE trigger_modify_updated_at();
COMMENT ON TRIGGER set_updated_at ON modules IS
'TRIGGER set_updated_at updates the value of the updated_at column to the current timestamp whenever a row is inserted or updated to the table.';
CREATE TABLE packages (
path text NOT NULL,
module_path text NOT NULL,
version text NOT NULL,
commit_time timestamp with time zone NOT NULL,
name text NOT NULL,
synopsis text,
license_types text[],
license_paths text[],
v1_path text NOT NULL,
goos text NOT NULL,
goarch text NOT NULL,
redistributable boolean DEFAULT false NOT NULL,
documentation text,
tsv_parent_directories tsvector,
created_at timestamp with time zone DEFAULT CURRENT_TIMESTAMP NOT NULL,
updated_at timestamp with time zone DEFAULT CURRENT_TIMESTAMP NOT NULL,
PRIMARY KEY (path, module_path, version),
FOREIGN KEY (module_path, version) REFERENCES modules(module_path, version) ON DELETE CASCADE
);
COMMENT ON TABLE packages IS
'TABLE packages contains packages in a specific module version.';
COMMENT ON COLUMN packages.commit_time IS
'commit_time is the same as verions.commit_time. It is added here so that we can reduce the number of joins in our queries.';
COMMENT ON COLUMN packages.tsv_parent_directories IS
'tsv_parent_directories should always be NOT NULL, but it is populated by a trigger, so it will be initially NULL on insert.';
CREATE INDEX idx_packages_v1_path ON packages (v1_path);
COMMENT ON INDEX idx_packages_v1_path IS
'INDEX idx_packages_v1_path is used to get all of the packages in a series.';
CREATE INDEX idx_packages_module_path_text_pattern_ops ON packages (module_path text_pattern_ops);
COMMENT ON INDEX idx_packages_module_path_text_pattern_ops IS
'INDEX idx_packages_module_path_text_pattern_ops is used to improve performance of LIKE statements for module_path. It is used to fetch directories matching a given module_path prefix.';
CREATE INDEX idx_packages_path_text_pattern_ops ON packages (path text_pattern_ops);
CREATE INDEX idx_packages_tsv_parent_directories ON packages USING gin (tsv_parent_directories);
COMMENT ON INDEX idx_packages_tsv_parent_directories IS
'INDEX idx_packages_tsv_parent_directories is used to search for packages that match a given prefix. These prefixes are stored as a tsv_vector type in tsv_parent_directories. This is used to fetch all packages in a given directory.';
CREATE FUNCTION trigger_modify_packages_tsv_parent_directories() RETURNS TRIGGER
LANGUAGE plpgsql
AS $$
BEGIN
NEW.tsv_parent_directories = to_tsvector_parent_directories(NEW.path, NEW.module_path);
RETURN NEW;
END;
$$;
COMMENT ON FUNCTION trigger_modify_packages_tsv_parent_directories IS
'FUNCTION trigger_modify_packages_tsv_parent_directories invokes FUNCTION to_tsvector_parent_directories and sets the value of tsv_parent_directories to the output.';
CREATE TRIGGER set_tsv_parent_directories BEFORE INSERT ON packages FOR EACH ROW EXECUTE PROCEDURE trigger_modify_packages_tsv_parent_directories();
COMMENT ON TRIGGER set_tsv_parent_directories ON packages IS
'TRIGGER set_tsv_parent_directories sets the value of tsv_parent_directories to the output of FUNCTION trigger_modify_search_documents_tsv_parent_directories when a new row in inserted.';
CREATE TRIGGER set_updated_at BEFORE INSERT OR UPDATE ON packages FOR EACH ROW EXECUTE PROCEDURE trigger_modify_updated_at();
COMMENT ON TRIGGER set_updated_at ON packages IS
'TRIGGER set_updated_at updates the value of the updated_at column to the current timestamp whenever a row is inserted or updated to the table.';
CREATE TABLE imports (
from_path text NOT NULL,
from_module_path text NOT NULL,
from_version text NOT NULL,
to_path text NOT NULL,
PRIMARY KEY (to_path, from_path, from_version, from_module_path),
FOREIGN KEY (from_path, from_module_path, from_version)
REFERENCES packages(path, module_path, version) ON DELETE CASCADE
);
COMMENT ON TABLE imports IS
'TABLE imports contains the imports for a package in the packages table. Package (from_path), in module (from_module_path) at version (from_version), imports package (to_path). We do not store the version and module at which to_path is imported because it is hard to compute.';
CREATE INDEX idx_imports_from_path_from_version ON imports (from_path, from_version);
COMMENT ON INDEX idx_imports_from_path_from_version IS
'INDEX idx_imports_from_path_from_version is used to improve performance of the imports tab.';
CREATE TABLE imports_unique (
to_path text NOT NULL,
from_path text NOT NULL,
from_module_path text NOT NULL,
PRIMARY KEY (to_path, from_path, from_module_path)
);
COMMENT ON TABLE imports_unique IS
'TABLE imports_unique contains the imports for a unique import_path in the packages table. The from_version is dropped; each row says that package from_path in some version of from_module_path imports (some version of) to_path. Used to speed up imported-by computations.';
CREATE TABLE licenses (
module_path text NOT NULL,
version text NOT NULL,
file_path text NOT NULL,
contents text NOT NULL,
types text[],
coverage jsonb,
PRIMARY KEY (module_path, version, file_path),
FOREIGN KEY (module_path, version) REFERENCES modules(module_path, version) ON DELETE CASCADE
);
COMMENT ON TABLE licenses IS
'TABLE licenses contains the license data for a given module version.';
COMMENT ON COLUMN licenses.coverage IS
'COLUMN coverage contains the JSON-serialized contents of the licensecheck.Coverage value returned from calling licencecheck.Cover.';
CREATE TABLE excluded_prefixes (
prefix text NOT NULL,
created_by text NOT NULL,
reason text NOT NULL,
created_at timestamp with time zone DEFAULT now(),
CONSTRAINT excluded_prefixes_created_by_check CHECK ((created_by <> ''::text)),
CONSTRAINT excluded_prefixes_prefix_check CHECK ((prefix <> ''::text)),
CONSTRAINT excluded_prefixes_reason_check CHECK ((reason <> ''::text)),
PRIMARY KEY (prefix)
);
COMMENT ON TABLE excluded_prefixes IS
'TABLE excluded_prefixes contains the prefixes of modules or groups of modules we exclude from serving and processing. This is used to deal with attacks.';
CREATE TABLE module_version_states (
module_path text NOT NULL,
version text NOT NULL,
status integer DEFAULT 0 NOT NULL,
error text DEFAULT ''::text NOT NULL,
try_count integer DEFAULT 0 NOT NULL,
last_processed_at timestamp with time zone,
next_processed_after timestamp with time zone DEFAULT CURRENT_TIMESTAMP NOT NULL,
index_timestamp timestamp with time zone NOT NULL,
created_at timestamp with time zone DEFAULT CURRENT_TIMESTAMP NOT NULL,
app_version text DEFAULT ''::text NOT NULL,
sort_version text NOT NULL,
go_mod_path text DEFAULT ''::text NOT NULL,
PRIMARY KEY (module_path, version)
);
COMMENT ON TABLE module_version_states IS
'TABLE module_version_states is used by the ETL to record the state of every module we have seen from the proxy index.';
COMMENT ON COLUMN module_version_states.sort_version IS
'COLUMN sort_version holds the version in a form suitable for use in ORDER BY. The string format is described in internal/version.ForSorting.';
COMMENT ON COLUMN module_version_states.go_mod_path IS
'COLUMN go_mod_path holds the module path from the go.mod file.';
CREATE INDEX idx_module_version_states_index_timestamp ON module_version_states (index_timestamp DESC);
COMMENT ON INDEX idx_module_version_states_index_timestamp IS
'INDEX idx_module_version_states_index_timestamp is used to get the last time a module version was fetched from the the module index.';
CREATE INDEX idx_module_version_states_last_processed_at ON module_version_states (last_processed_at);
COMMENT ON INDEX idx_module_version_states_last_processed_at IS
'INDEX idx_module_version_states_last_processed_at is used to get the next time at which a module version should be retried for processing.';
CREATE INDEX idx_module_version_states_next_processed_after ON module_version_states (next_processed_after);
COMMENT ON INDEX idx_module_version_states_next_processed_after IS
'INDEX idx_module_version_states_next_processed_after is used to get the next time at which a module version should be retried for processing.';
CREATE INDEX idx_module_version_states_sort_version ON module_version_states (sort_version DESC);
COMMENT ON INDEX idx_module_version_states_sort_version IS
'INDEX idx_module_version_states_sort_version is used to sort by version, to determine when a module version should be retried for processing.';
CREATE TABLE search_documents (
package_path text NOT NULL,
module_path text NOT NULL,
version text NOT NULL,
commit_time timestamp with time zone NOT NULL,
name text NOT NULL,
synopsis text,
license_types text[],
imported_by_count integer DEFAULT 0 NOT NULL,
redistributable boolean NOT NULL,
hll_register integer,
hll_leading_zeros integer,
tsv_parent_directories tsvector,
tsv_search_tokens tsvector NOT NULL,
created_at timestamp with time zone DEFAULT CURRENT_TIMESTAMP NOT NULL,
updated_at timestamp with time zone DEFAULT CURRENT_TIMESTAMP NOT NULL,
version_updated_at timestamp with time zone DEFAULT CURRENT_TIMESTAMP NOT NULL,
imported_by_count_updated_at timestamp with time zone,
has_go_mod boolean,
PRIMARY KEY (package_path),
FOREIGN KEY (package_path, module_path, version)
REFERENCES packages(path, module_path, version) ON DELETE CASCADE
);
COMMENT ON TABLE search_documents IS
'TABLE search_documents contains a record for the latest version of each package. It is used to generate search results.';
COMMENT ON COLUMN search_documents.hll_register IS
'hll_* columns are added to help implement cardinality estimation using the hyperloglog algorithm. hll_register is the randomized bucket for this record.';
COMMENT ON COLUMN search_documents.hll_leading_zeros IS
'hll_* columns are added to help implement cardinality estimation using the hyperloglog algorithm. hll_leading_zeros is the number of leading zeros in the binary representation of hll_hash(package_path).';
COMMENT ON COLUMN search_documents.has_go_mod IS
'COLUMN has_go_mod records whether the module zip contains a go.mod file.';
CREATE INDEX idx_imported_by_count_desc ON search_documents (imported_by_count DESC);
COMMENT ON INDEX idx_imported_by_count_desc IS
'INDEX idx_imported_by_count_desc is used by popular_search to execute a partial scan of popular search documents.';
CREATE INDEX idx_hll_register_leading_zeros ON search_documents (hll_register, hll_leading_zeros DESC);
COMMENT ON INDEX idx_hll_register_leading_zeros IS
'INDEX idx_hll_register_leading_zeros allows us to quickly find the maximum number of leading zeros among search documents in each register matching a query, which is necessary for hyperloglog cardinality estimation.';
CREATE INDEX idx_search_documents_imported_by_count_updated_at ON search_documents (imported_by_count_updated_at);
COMMENT ON INDEX idx_search_documents_imported_by_count_updated_at IS
'INDEX idx_search_documents_imported_by_count_updated_at index is used for incremental update of imported_by counts.';
CREATE INDEX idx_search_documents_module_path_version_package_path ON search_documents
(package_path, module_path, version);
COMMENT ON INDEX idx_search_documents_module_path_version_package_path IS
'INDEX idx_search_documents_module_path_version_package_path is used for the FK reference to packages.';
CREATE INDEX idx_search_documents_tsv_parent_directories ON search_documents USING gin (tsv_parent_directories);
COMMENT ON INDEX idx_search_documents_tsv_parent_directories IS
'INDEX idx_search_documents_tsv_parent_directories is used to search for packages that match a given prefix. These prefixes are stored as a tsv_vector type in tsv_parent_directories. This is used to fetch all packages in a given directory.';
CREATE INDEX idx_search_documents_tsv_search_tokens ON search_documents USING gin (tsv_search_tokens);
COMMENT ON INDEX idx_search_documents_tsv_search_tokens IS
'INDEX idx_search_documents_tsv_search_tokens improves performance for full-text search.';
CREATE INDEX idx_search_documents_version_updated_at ON search_documents (version_updated_at);
COMMENT ON INDEX idx_search_documents_version_updated_at IS
'INDEX idx_search_documents_version_updated_at is used for incremental update of imported_by counts, in order to determine when the latest version of a package was last updated.';
CREATE TRIGGER set_updated_at BEFORE INSERT OR UPDATE ON search_documents
FOR EACH ROW EXECUTE PROCEDURE trigger_modify_updated_at();
COMMENT ON TRIGGER set_updated_at ON search_documents IS
'TRIGGER set_updated_at updates the value of the updated_at column to the current timestamp whenever a row is inserted or updated to the table.';
CREATE FUNCTION trigger_modify_search_documents_tsv_parent_directories() RETURNS trigger
LANGUAGE plpgsql
AS $$
BEGIN
NEW.tsv_parent_directories = to_tsvector_parent_directories(NEW.package_path, NEW.module_path);
RETURN NEW;
END;
$$;
COMMENT ON FUNCTION trigger_modify_search_documents_tsv_parent_directories IS
'FUNCTION trigger_modify_search_documents_tsv_parent_directories invokes FUNCTION to_tsvector_parent_directories and sets the value of tsv_parent_directories to the output.';
CREATE TRIGGER set_tsv_parent_directories BEFORE INSERT ON search_documents
FOR EACH ROW EXECUTE PROCEDURE trigger_modify_search_documents_tsv_parent_directories();
COMMENT ON TRIGGER set_tsv_parent_directories ON search_documents IS
'TRIGGER set_tsv_parent_directories sets the value of tsv_parent_directories to the output of FUNCTION trigger_modify_search_documents_tsv_parent_directories when a new row in inserted.';
CREATE FUNCTION hll_hash(text) RETURNS bigint
LANGUAGE sql PARALLEL SAFE
AS $_$
-- This is somewhat a hack, since there is no from_hex function in postgres.
-- Take the first 64 bits of the md5 hash by converting the hexadecimal
-- string to bitfield, and then bigint.
SELECT ('x'||substr(md5($1),1,16))::BIT(64)::BIGINT;
$_$;
COMMENT ON FUNCTION hll_hash IS
'FUNCTION hll_hash is a 64-bit integral hash function, which is used in implementing the hyperloglog cardinality estimation algorithm.';
CREATE FUNCTION hll_zeros(bigint) RETURNS integer
LANGUAGE plpgsql PARALLEL SAFE
AS $_$
BEGIN
IF $1 < 0 THEN
RETURN 0;
END IF;
-- For bigints, taking log(2, $1) is too inaccurate due to floating point
-- issues. Specifically log(2, 1<<63-1) == 63.0...
FOR i IN 0..62 LOOP
IF ((1::BIGINT<<i) - 1) >= $1 THEN
RETURN 64-i;
END IF;
END LOOP;
RETURN 1;
END; $_$;
COMMENT ON FUNCTION hll_zeros(bigint) IS
'FUNCTION hll_zeros returns the number of leading zeros in the binary representation of the given bigint.';
CREATE TYPE search_result AS (
package_path text,
module_path text,
version text,
commit_time timestamp with time zone,
imported_by_count integer,
score double precision
);
COMMENT ON TYPE search_result IS
'TYPE search_result is used to simplify the popular_search function.';
CREATE FUNCTION popular_search(rawquery text, lim integer, off integer) RETURNS SETOF search_result
LANGUAGE plpgsql
AS $$
DECLARE cur CURSOR(query TSQUERY) FOR
SELECT
package_path,
module_path,
version,
commit_time,
imported_by_count,
(
ts_rank(tsv_search_tokens, query) *
ln(exp(1)+imported_by_count) *
CASE WHEN redistributable THEN 1 ELSE 0.5 END *
-- Rather than add this `tsv_search_tokens @@ query` check to a
-- where clause, we simply annihilate the score. Adding it to the
-- where clause caused the query planner to eventually decide to
-- use the tsv_search_token gin index rather than the popular
-- index, which is exactly what this stored proc is trying to
-- avoid.
-- It seems like this should be redundant with the ts_rank factor
-- above, but in fact it is possible for ts_rank to be nonzero, yet
-- tsv_search_tokens @@ query is false (I think because ts_rank doesn't
-- have special handling for AND or OR conjunctions).
CASE WHEN tsv_search_tokens @@ query THEN 1 ELSE 0 END
) score
FROM search_documents
-- This should use the popular document index.
ORDER BY imported_by_count DESC;
-- top is the top search results, sorted by score descending, commit time
-- descending, then package_path ascending.
top search_result[];
-- res is the current search result.
res search_result;
-- last_idx is the index of the last element in top.
last_idx INT;
BEGIN
last_idx := lim+off;
top := array_fill(NULL::search_result, array[last_idx]);
OPEN cur(query := websearch_to_tsquery(rawquery));
FETCH cur INTO res;
WHILE found LOOP
IF top[last_idx] IS NULL OR res.score >= top[last_idx].score THEN
-- Insert res into top, maintaining sort order.
FOR i IN 1..last_idx LOOP
-- We want to preserve order by score desc, commit_time desc,
-- package_path asc, so insert res as soon as it sorted before top[i]
-- according to this ordering.
IF top[i] IS NULL OR
(res.score > top[i].score) OR
(res.score = top[i].score AND res.commit_time > top[i].commit_time) OR
(res.score = top[i].score AND res.commit_time = top[i].commit_time AND
res.package_path < top[i].package_path) THEN
top := (top[1:i-1] || res) || top[i:last_idx-1];
EXIT;
END IF;
END LOOP;
END IF;
IF top[last_idx].score > ln(exp(1)+res.imported_by_count) THEN
-- No subsequent document can be scored higher than our lowest scoring
-- document, as top[last_idx].score > 1.0*ln(e+imported_by_count), and
-- for all subsequent records ts_rank <= 1.0 and ln(e+imported_by_count)
-- is monotonically decreasing.
-- So we're done.
EXIT;
END IF;
FETCH cur INTO res;
END LOOP;
CLOSE cur;
RETURN QUERY SELECT * FROM UNNEST(top[off+1:last_idx])
WHERE package_path IS NOT NULL AND score > 0.1;
END; $$;
COMMENT ON FUNCTION popular_search(rawquery text, lim integer, off integer) IS
'FUNCTION popular_search is used to generate results for search. It is implemented as a stored function, so that we can use a cursor to scan search documents procedurally, and stop scanning early, whenever our search results are provably correct.';
CREATE TEXT SEARCH CONFIGURATION golang (
PARSER = pg_catalog."default" );
ALTER TEXT SEARCH CONFIGURATION golang
ADD MAPPING FOR asciiword WITH simple, english_stem;
ALTER TEXT SEARCH CONFIGURATION golang
ADD MAPPING FOR word WITH english_stem;
ALTER TEXT SEARCH CONFIGURATION golang
ADD MAPPING FOR numword WITH simple;
ALTER TEXT SEARCH CONFIGURATION golang
ADD MAPPING FOR email WITH simple;
ALTER TEXT SEARCH CONFIGURATION golang
ADD MAPPING FOR url WITH simple;
ALTER TEXT SEARCH CONFIGURATION golang
ADD MAPPING FOR host WITH simple;
ALTER TEXT SEARCH CONFIGURATION golang
ADD MAPPING FOR sfloat WITH simple;
ALTER TEXT SEARCH CONFIGURATION golang
ADD MAPPING FOR version WITH simple;
ALTER TEXT SEARCH CONFIGURATION golang
ADD MAPPING FOR hword_numpart WITH simple;
ALTER TEXT SEARCH CONFIGURATION golang
ADD MAPPING FOR hword_part WITH english_stem;
ALTER TEXT SEARCH CONFIGURATION golang
ADD MAPPING FOR hword_asciipart WITH english_stem;
ALTER TEXT SEARCH CONFIGURATION golang
ADD MAPPING FOR numhword WITH simple;
ALTER TEXT SEARCH CONFIGURATION golang
ADD MAPPING FOR asciihword WITH english_stem;
ALTER TEXT SEARCH CONFIGURATION golang
ADD MAPPING FOR hword WITH english_stem;
ALTER TEXT SEARCH CONFIGURATION golang
ADD MAPPING FOR file WITH simple;
ALTER TEXT SEARCH CONFIGURATION golang
ADD MAPPING FOR "float" WITH simple;
ALTER TEXT SEARCH CONFIGURATION golang
ADD MAPPING FOR "int" WITH simple;
ALTER TEXT SEARCH CONFIGURATION golang
ADD MAPPING FOR uint WITH simple;
COMMENT ON TEXT SEARCH CONFIGURATION golang IS
'TEXT SEARCH CONFIGURATION golang is a custom search configuration used when creating tsvector for search. The url_path token type is remove, so that "github.com/foo/[email protected]" is indexed only as the full URL string, and not also"/foo/[email protected]". The asciiword token type is set to a "simple,english_stem" mapping, so that "plural" words will be indexed without stemming. This idea came from the "Morphological and Exact Search" section here: https://asp437.github.io/posts/flexible-fts.html.';
CREATE FUNCTION popular_search_go_mod(rawquery text, lim integer, off integer, redist_factor real, go_mod_factor real) RETURNS SETOF search_result
LANGUAGE plpgsql
AS $$
DECLARE cur CURSOR(query TSQUERY) FOR
SELECT
package_path,
module_path,
version,
commit_time,
imported_by_count,
(
ts_rank(tsv_search_tokens, query) *
ln(exp(1)+imported_by_count) *
CASE WHEN redistributable THEN 1 ELSE redist_factor END *
CASE WHEN COALESCE(has_go_mod, true) THEN 1 ELSE go_mod_factor END *
CASE WHEN tsv_search_tokens @@ query THEN 1 ELSE 0 END
) score
FROM search_documents
ORDER BY imported_by_count DESC;
top search_result[];
res search_result;
last_idx INT;
BEGIN
last_idx := lim+off;
top := array_fill(NULL::search_result, array[last_idx]);
OPEN cur(query := websearch_to_tsquery(rawquery));
FETCH cur INTO res;
WHILE found LOOP
IF top[last_idx] IS NULL OR res.score >= top[last_idx].score THEN
FOR i IN 1..last_idx LOOP
IF top[i] IS NULL OR
(res.score > top[i].score) OR
(res.score = top[i].score AND res.commit_time > top[i].commit_time) OR
(res.score = top[i].score AND res.commit_time = top[i].commit_time AND
res.package_path < top[i].package_path) THEN
top := (top[1:i-1] || res) || top[i:last_idx-1];
EXIT;
END IF;
END LOOP;
END IF;
IF top[last_idx].score > ln(exp(1)+res.imported_by_count) THEN
EXIT;
END IF;
FETCH cur INTO res;
END LOOP;
CLOSE cur;
RETURN QUERY SELECT * FROM UNNEST(top[off+1:last_idx])
WHERE package_path IS NOT NULL AND score > 0.1;
END; $$;
COMMENT ON FUNCTION popular_search_go_mod(rawquery text, lim integer, off integer, redist_factor real, go_mod_factor real) IS
'FUNCTION popular_search_go_mod is identical to popular_search except for the additional multiplier for the has_go_mod filed.';
SET default_tablespace = '';
SET default_with_oids = false;
CREATE TABLE alternative_module_paths (
alternative text NOT NULL,
canonical text NOT NULL,
created_at timestamp with time zone DEFAULT CURRENT_TIMESTAMP NOT NULL,
UNIQUE(alternative, canonical)
);
COMMENT ON TABLE alternative_module_paths IS
'TABLE alternative_module_paths contains module_paths that are known to have (1) a vanity import path, such as github.com/rsc/quote vs rsc.io/quote (2) a mismatch between the module path in the go.mod and repository, such as in the case of forks, or (3) a case insensitive spelling, such as in the case of github.com/sirupsen/logrus vs github.com/Sirupsen/logrus. It is used to filter out modules with the alternative path from the discovery site dataset.';
COMMENT ON COLUMN alternative_module_paths.alternative IS
'COLUMN alternative contains the path prefix of packages that should be filtered out from the discovery site search results. For example, github.com/google/go-cloud is the alternative prefix for all packages in the modules gocloud.dev and github.com/google/go-cloud.';
COMMENT ON COLUMN alternative_module_paths.canonical IS
'COLUMN canonical contains the module path that can be found in the go.mod file of a package. For example, gocloud.dev is the canonical prefix for all packages in gocloud.dev and github.com/google/go-cloud.';
CREATE TABLE experiments (
name text NOT NULL,
rollout integer DEFAULT 0 NOT NULL,
description text NOT NULL,
PRIMARY KEY (name),
CONSTRAINT experiments_rollout_check CHECK (((rollout >= 0) AND (rollout <= 100)))
);
COMMENT ON TABLE experiments IS
'TABLE experiments contains data for running experiments.';
COMMENT ON COLUMN experiments.name IS
'COLUMN name is the name of the experiment.';
COMMENT ON COLUMN experiments.rollout IS
'COLUMN rollout is the percentage of total requests that are included for the experiment.';
COMMENT ON COLUMN experiments.description IS
'COLUMN description describes the experiment.';
CREATE TABLE package_version_states (
package_path text NOT NULL,
module_path text NOT NULL,
version text NOT NULL,
status integer NOT NULL,
error text,
created_at timestamp with time zone DEFAULT CURRENT_TIMESTAMP NOT NULL,
updated_at timestamp with time zone DEFAULT CURRENT_TIMESTAMP NOT NULL,
PRIMARY KEY (package_path, module_path, version),
FOREIGN KEY (module_path, version) REFERENCES module_version_states(module_path, version) ON DELETE CASCADE
);
COMMENT ON TABLE package_version_states IS
'TABLE package_version_states is used to record the state of every package we have seen from the proxy.';
CREATE TRIGGER set_updated_at BEFORE INSERT OR UPDATE ON package_version_states
FOR EACH ROW EXECUTE PROCEDURE trigger_modify_updated_at();
COMMENT ON TRIGGER set_updated_at ON package_version_states IS
'TRIGGER set_updated_at updates the value of the updated_at column to the current timestamp whenever a row is inserted or updated to the table.';
CREATE TABLE version_map (
module_path text NOT NULL,
requested_version text NOT NULL,
resolved_version text,
status integer NOT NULL,
error text,
created_at timestamp with time zone DEFAULT CURRENT_TIMESTAMP NOT NULL,
updated_at timestamp with time zone DEFAULT CURRENT_TIMESTAMP NOT NULL,
sort_version text,
PRIMARY KEY (module_path, requested_version)
);
COMMENT ON TABLE version_map IS
'TABLE version_map contains data about a user-requested path and the semantic version that it resolves to. It is used to support fetching frontend detail pages using module queries.';
COMMENT ON COLUMN version_map.requested_version IS
'COLUMN requested_version is the version that was requested by a user from the frontend. It may or may not resolve to a semantic version.';
COMMENT ON COLUMN version_map.resolved_version IS
'COLUMN resolved_version is the semantic version that a requested_version resolves to.';
COMMENT ON COLUMN version_map.status IS
'COLUMN status is the status returned by the ETL when fetching the module version.';
COMMENT ON COLUMN version_map.error IS
'COLUMN status is the error that occurred when fetching the module version, in cases when status != 200.';
CREATE TRIGGER set_updated_at BEFORE INSERT OR UPDATE ON version_map
FOR EACH ROW EXECUTE PROCEDURE trigger_modify_updated_at();