forked from bluesky-social/atproto
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Deduping indexing (bluesky-social#354)
* record processor
* moving records to new processor
* plugins finished
* hook up to db
* migration
* yay tests working
* Tx migration (bluesky-social#355)
* migrations in txs
* fix tx issue
* testing
* test deduping
* test assertions
* rm types on migrations
* don't do migrations in txs, fixes pg issue
- Loading branch information
Showing 24 changed files with 1,545 additions and 607 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
167 changes: 167 additions & 0 deletions
167
packages/pds/src/db/migrations/20221116T234458063Z-duplicate-records.ts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,167 @@ | ||
import { Kysely } from 'kysely' | ||
|
||
/** Name of the table this migration creates (in `up`) and drops (in `down`). */
const duplicateRecordTable = 'duplicate_record'
|
||
/**
 * Applies the migration.
 *
 * 1. Creates the `duplicate_record` table (`uri` primary key, plus `cid`,
 *    `duplicateOf` and `indexedAt`).
 * 2. Rebuilds `repost`, `trend`, `vote`, `follow` and `assertion`: for each one
 *    a `<table>_temp` table carrying a unique constraint is created, the
 *    de-duplicated rows are copied into it, the original table is dropped and
 *    the temp table is renamed into its place.
 *
 * @param db - Kysely instance the schema changes are executed against.
 */
export async function up(db: Kysely<any>): Promise<void> {
  await db.schema
    .createTable(duplicateRecordTable)
    .addColumn('uri', 'varchar', (col) => col.primaryKey())
    .addColumn('cid', 'varchar', (col) => col.notNull())
    .addColumn('duplicateOf', 'varchar', (col) => col.notNull())
    .addColumn('indexedAt', 'varchar', (col) => col.notNull())
    .execute()

  // repost: at most one row per (creator, subject)
  await db.schema
    .createTable('repost_temp')
    .addColumn('uri', 'varchar', (col) => col.primaryKey())
    .addColumn('cid', 'varchar', (col) => col.notNull())
    .addColumn('creator', 'varchar', (col) => col.notNull())
    .addColumn('subject', 'varchar', (col) => col.notNull())
    .addColumn('subjectCid', 'varchar', (col) => col.notNull())
    .addColumn('createdAt', 'varchar', (col) => col.notNull())
    .addColumn('indexedAt', 'varchar', (col) => col.notNull())
    .addUniqueConstraint('repost_unique_subject', ['creator', 'subject'])
    .execute()
  await replaceWithDedupedTemp(db, 'repost', ['creator', 'subject'])

  // trend: at most one row per (creator, subject)
  await db.schema
    .createTable('trend_temp')
    .addColumn('uri', 'varchar', (col) => col.primaryKey())
    .addColumn('cid', 'varchar', (col) => col.notNull())
    .addColumn('creator', 'varchar', (col) => col.notNull())
    .addColumn('subject', 'varchar', (col) => col.notNull())
    .addColumn('subjectCid', 'varchar', (col) => col.notNull())
    .addColumn('createdAt', 'varchar', (col) => col.notNull())
    .addColumn('indexedAt', 'varchar', (col) => col.notNull())
    .addUniqueConstraint('trend_unique_subject', ['creator', 'subject'])
    .execute()
  await replaceWithDedupedTemp(db, 'trend', ['creator', 'subject'])

  // vote: at most one row per (creator, subject)
  await db.schema
    .createTable('vote_temp')
    .addColumn('uri', 'varchar', (col) => col.primaryKey())
    .addColumn('cid', 'varchar', (col) => col.notNull())
    .addColumn('creator', 'varchar', (col) => col.notNull())
    .addColumn('direction', 'varchar', (col) => col.notNull())
    .addColumn('subject', 'varchar', (col) => col.notNull())
    .addColumn('subjectCid', 'varchar', (col) => col.notNull())
    .addColumn('createdAt', 'varchar', (col) => col.notNull())
    .addColumn('indexedAt', 'varchar', (col) => col.notNull())
    .addUniqueConstraint('vote_unique_subject', ['creator', 'subject'])
    .execute()
  await replaceWithDedupedTemp(db, 'vote', ['creator', 'subject'])

  // follow: at most one row per (creator, subjectDid)
  await db.schema
    .createTable('follow_temp')
    .addColumn('uri', 'varchar', (col) => col.primaryKey())
    .addColumn('cid', 'varchar', (col) => col.notNull())
    .addColumn('creator', 'varchar', (col) => col.notNull())
    .addColumn('subjectDid', 'varchar', (col) => col.notNull())
    .addColumn('subjectDeclarationCid', 'varchar', (col) => col.notNull())
    .addColumn('createdAt', 'varchar', (col) => col.notNull())
    .addColumn('indexedAt', 'varchar', (col) => col.notNull())
    .addUniqueConstraint('follow_unique_subject', ['creator', 'subjectDid'])
    .execute()
  await replaceWithDedupedTemp(db, 'follow', ['creator', 'subjectDid'])

  // assertion: at most one row per (creator, subjectDid, assertion)
  await db.schema
    .createTable('assertion_temp')
    .addColumn('uri', 'varchar', (col) => col.primaryKey())
    .addColumn('cid', 'varchar', (col) => col.notNull())
    .addColumn('creator', 'varchar', (col) => col.notNull())
    .addColumn('assertion', 'varchar', (col) => col.notNull())
    .addColumn('subjectDid', 'varchar', (col) => col.notNull())
    .addColumn('subjectDeclarationCid', 'varchar', (col) => col.notNull())
    .addColumn('createdAt', 'varchar', (col) => col.notNull())
    .addColumn('indexedAt', 'varchar', (col) => col.notNull())
    .addColumn('confirmUri', 'varchar')
    .addColumn('confirmCid', 'varchar')
    .addColumn('confirmCreated', 'varchar')
    .addColumn('confirmIndexed', 'varchar')
    .addUniqueConstraint('assertion_unique_subject', [
      'creator',
      'subjectDid',
      'assertion',
    ])
    .execute()
  await replaceWithDedupedTemp(db, 'assertion', [
    'creator',
    'subjectDid',
    'assertion',
  ])
}

/**
 * Fills `<table>_temp` with the de-duplicated rows of `table`, then swaps it in.
 *
 * For every distinct combination of `uniqueOn` values only the row with the
 * smallest `uri` is copied. Afterwards `table` is dropped and `<table>_temp`
 * is renamed to `table`. The temp table must already exist.
 *
 * @param db - Kysely instance to run the statements on.
 * @param table - Name of the table being rebuilt.
 * @param uniqueOn - Columns whose combination must be unique in the result.
 */
async function replaceWithDedupedTemp(
  db: Kysely<any>,
  table: string,
  uniqueOn: string[],
): Promise<void> {
  const tempTable = `${table}_temp`
  await db
    .insertInto(tempTable)
    .expression((exp) =>
      exp
        .selectFrom(table)
        .selectAll()
        .where('uri', 'in', (qb) =>
          qb
            .selectFrom(table)
            .select(db.fn.min('uri').as('uri'))
            .groupBy(uniqueOn),
        ),
    )
    .execute()
  await db.schema.dropTable(table).execute()
  await db.schema.alterTable(tempTable).renameTo(table).execute()
}
|
||
/**
 * Reverts the migration by dropping the `duplicate_record` table.
 *
 * Only the table created by `up` is removed. The rebuilt `repost`, `trend`,
 * `vote`, `follow` and `assertion` tables keep their unique constraints,
 * because `up` dropped the original tables and the discarded rows are gone.
 *
 * @param db - Kysely instance the schema change is executed against.
 */
export async function down(db: Kysely<unknown>): Promise<void> {
  // Schema builders are lazy: nothing reaches the database until `.execute()`
  // is called, so awaiting the bare builder would silently do nothing.
  await db.schema.dropTable(duplicateRecordTable).execute()
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
import { AtUri } from '@atproto/uri' | ||
import * as common from '@atproto/common' | ||
import { Kysely } from 'kysely' | ||
import { CID } from 'multiformats/cid' | ||
import { DatabaseSchema } from './database-schema' | ||
import { Message } from './message-queue/messages' | ||
import * as schemas from './schemas' | ||
import { RecordValidator, ValidationResult } from '@atproto/lexicon' | ||
|
||
/**
 * Collection-specific hooks supplied to a `RecordProcessor`.
 *
 * `T` is the record type accepted by the hooks (after schema validation);
 * `S` is whatever `insertFn` / `deleteFn` resolve to and is passed on to the
 * event builders.
 */
type RecordProcessorParams<T, S> = {
  /** Schema id; used to build the record validator and as the collection name. */
  schemaId: string
  /**
   * Stores a record. Resolves to an `S` when a new row was written; a `null`
   * result makes the processor look for a duplicate instead of emitting events.
   */
  insertFn: (
    db: Kysely<DatabaseSchema>,
    uri: AtUri,
    cid: CID,
    obj: T,
    timestamp?: string,
  ) => Promise<S | null>
  /**
   * Looks up the already-stored record that `obj` duplicates. Resolves to its
   * URI, or `null` when there is none.
   */
  findDuplicate: (
    db: Kysely<DatabaseSchema>,
    uri: AtUri,
    obj: T,
  ) => Promise<AtUri | null>
  /** Removes the record at `uri`. Resolves to `null` when nothing was deleted. */
  deleteFn: (db: Kysely<DatabaseSchema>, uri: AtUri) => Promise<S | null>
  /** Builds the messages returned after a successful insert. */
  eventsForInsert: (obj: S) => Message[]
  /**
   * Builds the messages returned after a delete. `replacedBy` is the row that
   * took the deleted record's place, or `null` when there is none.
   */
  eventsForDelete: (prev: S, replacedBy: S | null) => Message[]
}
|
||
/**
 * Indexes the records of one collection while de-duplicating them.
 *
 * Inserting and deleting rows is delegated to the hooks in `params`. This class
 * adds schema validation and the `duplicate_record` bookkeeping: a record that
 * could not be inserted because it duplicates an existing one is remembered in
 * `duplicate_record`, and may be promoted when the record it duplicates is
 * deleted.
 */
export class RecordProcessor<T, S> {
  /** Collection handled by this processor; equal to `params.schemaId`. */
  collection: string
  /** Validator created for `params.schemaId`. */
  validator: RecordValidator
  constructor(
    private db: Kysely<DatabaseSchema>,
    private params: RecordProcessorParams<T, S>,
  ) {
    this.collection = this.params.schemaId
    this.validator = schemas.records.createRecordValidator(this.params.schemaId)
  }

  /** Type guard: whether `obj` is valid according to this processor's schema. */
  matchesSchema(obj: unknown): obj is T {
    return this.validator.isValid(obj)
  }

  /** Validates `obj` against this processor's schema and returns the full result. */
  validateSchema(obj: unknown): ValidationResult {
    return this.validator.validate(obj)
  }

  /**
   * Indexes a record.
   *
   * @param uri - URI of the record.
   * @param cid - CID of the record.
   * @param obj - Record contents; must match the processor's schema.
   * @param timestamp - Optional indexing time forwarded to `insertFn`; also
   *   used for the `duplicate_record` row, defaulting to the current time.
   * @returns The insert events when a new row was written, otherwise `[]`.
   * @throws Error when `obj` does not match the schema.
   */
  async insertRecord(
    uri: AtUri,
    cid: CID,
    obj: unknown,
    timestamp?: string,
  ): Promise<Message[]> {
    if (!this.matchesSchema(obj)) {
      throw new Error(`Record does not match schema: ${this.params.schemaId}`)
    }
    const inserted = await this.params.insertFn(
      this.db,
      uri,
      cid,
      obj,
      timestamp,
    )
    // if this was a new record, return events
    if (inserted) {
      return this.params.eventsForInsert(inserted)
    }
    // if duplicate, insert into duplicates table with no events
    const found = await this.params.findDuplicate(this.db, uri, obj)
    // A record is never a duplicate of itself: when the lookup resolves to the
    // very URI being inserted (e.g. the record is indexed again), skip the
    // bookkeeping row instead of recording `uri` as a duplicate of `uri`.
    if (found && found.toString() !== uri.toString()) {
      await this.db
        .insertInto('duplicate_record')
        .values({
          uri: uri.toString(),
          cid: cid.toString(),
          duplicateOf: found.toString(),
          indexedAt: timestamp || new Date().toISOString(),
        })
        .execute()
    }
    return []
  }

  /**
   * Removes a record from the index.
   *
   * @param uri - URI of the record to delete.
   * @param cascading - When `true`, the duplicates recorded for `uri` are
   *   discarded. When `false` (default), the oldest recorded duplicate whose
   *   block is still present is promoted in place of the deleted record.
   * @returns The delete events, or `[]` when `deleteFn` deleted nothing.
   */
  async deleteRecord(uri: AtUri, cascading = false): Promise<Message[]> {
    const uriStr = uri.toString()
    // The record being deleted may itself only be tracked as a duplicate.
    // Drop that row so a deleted record can never be promoted later on.
    await this.db
      .deleteFrom('duplicate_record')
      .where('uri', '=', uriStr)
      .execute()

    const deleted = await this.params.deleteFn(this.db, uri)
    if (!deleted) return []

    if (cascading) {
      await this.db
        .deleteFrom('duplicate_record')
        .where('duplicateOf', '=', uriStr)
        .execute()
      return this.params.eventsForDelete(deleted, null)
    }

    // Oldest duplicate of the deleted record whose block is still stored.
    // Column references are table-qualified so they stay unambiguous in the join.
    const found = await this.db
      .selectFrom('duplicate_record')
      .innerJoin('ipld_block', 'ipld_block.cid', 'duplicate_record.cid')
      .where('duplicate_record.duplicateOf', '=', uriStr)
      .orderBy('duplicate_record.indexedAt', 'asc')
      .limit(1)
      .selectAll()
      .executeTakeFirst()

    if (!found) {
      return this.params.eventsForDelete(deleted, null)
    }
    const record = common.ipldBytesToRecord(found.content)
    if (!this.matchesSchema(record)) {
      return this.params.eventsForDelete(deleted, null)
    }
    const inserted = await this.params.insertFn(
      this.db,
      new AtUri(found.uri),
      CID.parse(found.cid),
      record,
      found.indexedAt,
    )
    if (inserted) {
      // The promoted record is now indexed, so it is no longer a duplicate.
      await this.db
        .deleteFrom('duplicate_record')
        .where('uri', '=', found.uri)
        .execute()
      // Re-point the remaining duplicates at the promoted record, otherwise
      // they keep referencing the deleted URI and could never be promoted.
      await this.db
        .updateTable('duplicate_record')
        .set({ duplicateOf: found.uri })
        .where('duplicateOf', '=', uriStr)
        .execute()
    }
    return this.params.eventsForDelete(deleted, inserted)
  }
}
|
||
export default RecordProcessor |
Oops, something went wrong.