From 39dc8605a3b252a3c75ffdfa10163bb5f1dfa22e Mon Sep 17 00:00:00 2001 From: Mario Kahlhofer Date: Thu, 26 Nov 2020 22:05:33 +0100 Subject: [PATCH] #4 #5 Fill out building and capacity information for cases where the scraper fails --- src/scraper/main.ts | 7 ++- src/scraper/resources/buildings.json | 7 --- .../resources/{ignore.json => extra.json} | 8 +++- src/scraper/scraper.ts | 48 ++++++++++++++++++- src/scraper/types.ts | 9 ++++ webpack.config.js | 18 ++++--- 6 files changed, 75 insertions(+), 22 deletions(-) delete mode 100644 src/scraper/resources/buildings.json rename src/scraper/resources/{ignore.json => extra.json} (72%) diff --git a/src/scraper/main.ts b/src/scraper/main.ts index 00fa6c3..1ab0c35 100644 --- a/src/scraper/main.ts +++ b/src/scraper/main.ts @@ -3,7 +3,7 @@ import { writeFile } from "fs"; import { Log } from "./log"; import { Scraper } from "./scraper"; import { IndexDto } from "../common/dto"; -import { BuildingToRoomsMap } from "./types"; +import { BuildingToRoomsMap, RoomToCapacityMap } from "./types"; /** The full base URL to the kusss instance */ declare let KUSSS_URL: string; @@ -23,6 +23,8 @@ declare let REQUEST_DELAY_MS: number; declare let IGNORE_ROOMS: string[]; /** Provides manual metadata for room and building mappings */ declare let EXTRA_BUILDING_METADATA: BuildingToRoomsMap; +/** Provides manual metadata for room capacities */ +declare let EXTRA_CAPACITY_METADATA: RoomToCapacityMap; Log.info("initializing scraper"); @@ -41,7 +43,8 @@ const scraper = new Scraper( MAX_RETRIES, REQUEST_DELAY_MS, IGNORE_ROOMS, - EXTRA_BUILDING_METADATA + EXTRA_BUILDING_METADATA, + EXTRA_CAPACITY_METADATA ); scraper diff --git a/src/scraper/resources/buildings.json b/src/scraper/resources/buildings.json deleted file mode 100644 index ea81004..0000000 --- a/src/scraper/resources/buildings.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "buildings": { - "Evangelical Student Dormitory (ESH)": ["ESH 1", "ESH 2", "ESH 3"], - "Johannes Kepler Dormitory
(JKH)": ["KEP 1", "KEP 3"], - "Franz Jägerstätter Dormitory (KHG)": ["KHG I"] - } -} diff --git a/src/scraper/resources/ignore.json b/src/scraper/resources/extra.json similarity index 72% rename from src/scraper/resources/ignore.json rename to src/scraper/resources/extra.json index 4faaf5b..8b119c9 100644 --- a/src/scraper/resources/ignore.json +++ b/src/scraper/resources/extra.json @@ -1,5 +1,11 @@ { - "rooms": [ + "buildings": { + "Evangelical Student Dormitory (ESH)": ["ESH 1", "ESH 2", "ESH 3"], + "Johannes Kepler Dormitory (JKH)": ["KEP 1", "KEP 3"], + "Franz Jägerstätter Dormitory (KHG)": ["KHG I"] + }, + "capacities": {}, + "ignore": [ "deadline", "digital", "div.", diff --git a/src/scraper/scraper.ts b/src/scraper/scraper.ts index 315b899..a1da806 100644 --- a/src/scraper/scraper.ts +++ b/src/scraper/scraper.ts @@ -29,6 +29,7 @@ import { ScrapeStatistics, BuildingToRoomsMap as BuildingToRooms, BookingScrape, + RoomToCapacityMap, } from "./types"; import { KusssRoomScraper, JkuRoomScraper } from "./components/rooms"; import { @@ -62,6 +63,7 @@ export class Scraper { private readonly quickMode: boolean; private readonly ignoreRooms: string[]; private readonly extraBuildingMeta: BuildingToRooms; + private readonly extraCapacityMeta: RoomToCapacityMap; private readonly requestLimiter: Bottleneck; private readonly requestOptions: OptionsOfTextResponseBody; @@ -81,13 +83,15 @@ export class Scraper { maxRetries = 5, requestDelay = 500, ignoreRooms: string[] = [], - extraBuildingMeta: BuildingToRooms = {} + extraBuildingMeta: BuildingToRooms = {}, + extraCapacityMeta: RoomToCapacityMap = {} ) { this.quickMode = quickMode; this.jkuUrl = jkuUrl; this.kusssUrl = kusssUrl; this.ignoreRooms = ignoreRooms.map(Scraper.getCncnlName); this.extraBuildingMeta = extraBuildingMeta; + this.extraCapacityMeta = extraCapacityMeta; // initialize request configuration and statistics object this.requestOptions = { @@ -103,6 +107,8 @@ export class Scraper { nScrapedBookings: 0, 
nIgnoredBookings: 0, nScrapedCourses: 0, + nExtraBuildings: 0, + nExtraRooms: 0, nDays: 0, nRequests: 0, nScrapedKusssRooms: 0, @@ -171,6 +177,30 @@ export class Scraper { if (this.quickMode && i > 5) break; } + /* add extra building and room metadata if necessary */ + + for (const building in this.extraBuildingMeta) { + // register buildings from the extra metadata that have no id yet + if (!(building in buildingToId)) { + this.statistics.nExtraBuildings += 1; + + const id = bid++; + result.buildings[id] = { name: building }; + buildingToId[building] = id; + } + + // add (or overwrite) room metadata; capacity falls back to -1 when no extra capacity is given + for (const room of this.extraBuildingMeta[building]) { + this.statistics.nExtraRooms += 1; + jRooms[Scraper.getCncnlName(room)] = { + name: room, + buildingId: buildingToId[building], + capacity: this.extraCapacityMeta[room] ?? -1, + }; + } + } + + this.logExtraBuildingMetrics(); this.logJkuRoomMetrics(jRooms); Log.sectionmark(); @@ -387,12 +417,26 @@ export class Scraper { const numRooms = Object.keys(rooms).length; Log.milestone( "room", - `scraped ${numRooms} rooms from the JKU homepage`, + `scraped ${numRooms} rooms from the JKU homepage or extraneous resource files`, numRooms ); Log.obj(Object.values(rooms).map((r) => r.name)); } + private logExtraBuildingMetrics(): void { + if ( + this.statistics.nExtraBuildings > 0 || + this.statistics.nExtraRooms > 0 + ) { + Log.scrape( + "room", + `appended ${this.statistics.nExtraBuildings} extra buildings and ${this.statistics.nExtraRooms} room mappings`, + 1 + ); + Log.obj(this.extraBuildingMeta); + } + } + private logMergedRoomMetrics(rooms: RoomsDto): void { const missing = Object.values(rooms).filter( (r) => r.building === -1 || r.capacity === -1 diff --git a/src/scraper/types.ts b/src/scraper/types.ts index c9a239a..3e84237 100644 --- a/src/scraper/types.ts +++ b/src/scraper/types.ts @@ -55,6 +55,8 @@ export declare interface BookingScrape { export declare interface ScrapeStatistics { nRequests: number; nScrapedBuildings: number;
+ nExtraBuildings: number; + nExtraRooms: number; nScrapedKusssRooms: number; nScrapedJkuRooms: number; nScrapedCourses: number; @@ -76,6 +78,13 @@ export declare interface BuildingToRoomsMap { [building: string]: string[]; } +/** + * Maps room names to capacities, for rooms that are lacking capacity data + */ +export declare interface RoomToCapacityMap { + [room: string]: number; +} + export const SEARCH_PAGE = "/kusss/coursecatalogue-start.action?advanced=true"; export const SEARCH_RESULTS = "/kusss/coursecatalogue-search-lvas.action?sortParam0courses=lvaName&asccourses=true" + diff --git a/webpack.config.js b/webpack.config.js index 70cae45..29f2b62 100644 --- a/webpack.config.js +++ b/webpack.config.js @@ -96,6 +96,11 @@ const appConfig = (env, options) => { }; }; +// a file with some hand-coded extra resources necessary for scraping +const extraResources = JSON.parse( + fs.readFileSync("./src/scraper/resources/extra.json") +); + const scraperConfig = (env, options) => { return { target: "node", @@ -131,16 +136,9 @@ const scraperConfig = (env, options) => { options.mode !== "production" ? JSON.stringify(500) : JSON.stringify(1), - IGNORE_ROOMS: JSON.stringify( - JSON.parse(fs.readFileSync("./src/scraper/resources/ignore.json"))[ - "rooms" - ] - ), - EXTRA_BUILDING_METADATA: JSON.stringify( - JSON.parse(fs.readFileSync("./src/scraper/resources/buildings.json"))[ - "buildings" - ] - ), + IGNORE_ROOMS: JSON.stringify(extraResources["ignore"]), + EXTRA_BUILDING_METADATA: JSON.stringify(extraResources["buildings"]), + EXTRA_CAPACITY_METADATA: JSON.stringify(extraResources["capacities"]), }), ], };