Skip to content

Commit

Permalink
#4 #5 Fill out building and capacity information for cases where the …
Browse files Browse the repository at this point in the history
…scraper fails
  • Loading branch information
blu3r4y committed Nov 26, 2020
1 parent 8facc70 commit 39dc860
Show file tree
Hide file tree
Showing 6 changed files with 75 additions and 22 deletions.
7 changes: 5 additions & 2 deletions src/scraper/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import { writeFile } from "fs";
import { Log } from "./log";
import { Scraper } from "./scraper";
import { IndexDto } from "../common/dto";
import { BuildingToRoomsMap } from "./types";
import { BuildingToRoomsMap, RoomToCapacityMap } from "./types";

/** The full base URL to the kusss instance */
declare let KUSSS_URL: string;
Expand All @@ -23,6 +23,8 @@ declare let REQUEST_DELAY_MS: number;
declare let IGNORE_ROOMS: string[];
/** Provides manual metadata for room and building mappings */
declare let EXTRA_BUILDING_METADATA: BuildingToRoomsMap;
/** Provides manual metadata for room capacities */
declare let EXTRA_CAPACITY_METADATA: RoomToCapacityMap;

Log.info("initializing scraper");

Expand All @@ -41,7 +43,8 @@ const scraper = new Scraper(
MAX_RETRIES,
REQUEST_DELAY_MS,
IGNORE_ROOMS,
EXTRA_BUILDING_METADATA
EXTRA_BUILDING_METADATA,
EXTRA_CAPACITY_METADATA
);

scraper
Expand Down
7 changes: 0 additions & 7 deletions src/scraper/resources/buildings.json

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
{
"rooms": [
"buildings": {
"Evangelical Student Dormitory (ESH)": ["ESH 1", "ESH 2", "ESH 3"],
"Johannes Kepler Dormitory (JKH)": ["KEP 1", "KEP 3"],
"Franz Jägerstätter Dormitory (KHG)": ["KHG I"]
},
"capacities": {},
"ignore": [
"deadline",
"digital",
"div.",
Expand Down
48 changes: 46 additions & 2 deletions src/scraper/scraper.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import {
ScrapeStatistics,
BuildingToRoomsMap as BuildingToRooms,
BookingScrape,
RoomToCapacityMap,
} from "./types";
import { KusssRoomScraper, JkuRoomScraper } from "./components/rooms";
import {
Expand Down Expand Up @@ -62,6 +63,7 @@ export class Scraper {
private readonly quickMode: boolean;
private readonly ignoreRooms: string[];
private readonly extraBuildingMeta: BuildingToRooms;
private readonly extraCapacityMeta: RoomToCapacityMap;
private readonly requestLimiter: Bottleneck;
private readonly requestOptions: OptionsOfTextResponseBody;

Expand All @@ -81,13 +83,15 @@ export class Scraper {
maxRetries = 5,
requestDelay = 500,
ignoreRooms: string[] = [],
extraBuildingMeta: BuildingToRooms = {}
extraBuildingMeta: BuildingToRooms = {},
extraCapacityMeta: RoomToCapacityMap = {}
) {
this.quickMode = quickMode;
this.jkuUrl = jkuUrl;
this.kusssUrl = kusssUrl;
this.ignoreRooms = ignoreRooms.map(Scraper.getCncnlName);
this.extraBuildingMeta = extraBuildingMeta;
this.extraCapacityMeta = extraCapacityMeta;

// initialize request configuration and statistics object
this.requestOptions = {
Expand All @@ -103,6 +107,8 @@ export class Scraper {
nScrapedBookings: 0,
nIgnoredBookings: 0,
nScrapedCourses: 0,
nExtraBuildings: 0,
nExtraRooms: 0,
nDays: 0,
nRequests: 0,
nScrapedKusssRooms: 0,
Expand Down Expand Up @@ -171,6 +177,30 @@ export class Scraper {
if (this.quickMode && i > 5) break;
}

/* add extra building and room metadata if necessary */

for (const building in this.extraBuildingMeta) {
// add additional buildings that might not already exist
if (!(building in buildingToId)) {
this.statistics.nExtraBuildings += 1;

const id = bid++;
result.buildings[id] = { name: building };
buildingToId[building] = id;
}

// add additional room metadata
for (const room of this.extraBuildingMeta[building]) {
this.statistics.nExtraRooms += 1;
jRooms[Scraper.getCncnlName(room)] = {
name: room,
buildingId: buildingToId[building],
capacity: this.extraCapacityMeta[room] ?? -1,
};
}
}

this.logExtraBuildingMetrics();
this.logJkuRoomMetrics(jRooms);
Log.sectionmark();

Expand Down Expand Up @@ -387,12 +417,26 @@ export class Scraper {
const numRooms = Object.keys(rooms).length;
Log.milestone(
"room",
`scraped ${numRooms} rooms from the JKU homepage`,
`scraped ${numRooms} rooms from the JKU homepage or extraneous resource files`,
numRooms
);
Log.obj(Object.values(rooms).map((r) => r.name));
}

/**
 * Reports the extra building metadata that was appended to the scrape.
 *
 * Nothing is logged unless at least one of the `nExtraBuildings` or
 * `nExtraRooms` counters is positive. Otherwise a single scrape log entry
 * summarizes both counters, followed by a dump of the extra building
 * metadata object.
 */
private logExtraBuildingMetrics(): void {
  const { nExtraBuildings, nExtraRooms } = this.statistics;

  // bail out early when no extra metadata was applied
  const hasExtras = nExtraBuildings > 0 || nExtraRooms > 0;
  if (!hasExtras) {
    return;
  }

  const summary = `appended ${nExtraBuildings} extra buildings and ${nExtraRooms} room mappings`;
  Log.scrape("room", summary, 1);
  Log.obj(this.extraBuildingMeta);
}

private logMergedRoomMetrics(rooms: RoomsDto): void {
const missing = Object.values(rooms).filter(
(r) => r.building === -1 || r.capacity === -1
Expand Down
9 changes: 9 additions & 0 deletions src/scraper/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ export declare interface BookingScrape {
export declare interface ScrapeStatistics {
nRequests: number;
nScrapedBuildings: number;
nExtraBuildings: number;
nExtraRooms: number;
nScrapedKusssRooms: number;
nScrapedJkuRooms: number;
nScrapedCourses: number;
Expand All @@ -76,6 +78,13 @@ export declare interface BuildingToRoomsMap {
[building: string]: string[];
}

/**
 * Maps a room name to its capacity.
 *
 * Supplies capacity values for rooms whose capacity is otherwise missing.
 */
export declare interface RoomToCapacityMap {
// key: room name, value: capacity of that room
[room: string]: number;
}

export const SEARCH_PAGE = "/kusss/coursecatalogue-start.action?advanced=true";
export const SEARCH_RESULTS =
"/kusss/coursecatalogue-search-lvas.action?sortParam0courses=lvaName&asccourses=true" +
Expand Down
18 changes: 8 additions & 10 deletions webpack.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,11 @@ const appConfig = (env, options) => {
};
};

// A file with some hand-coded extra resources necessary for scraping.
// It is read synchronously when this configuration module is evaluated.
// The explicit "utf8" encoding makes readFileSync return a string, so
// JSON.parse does not rely on an implicit Buffer-to-string conversion.
const extraResources = JSON.parse(
  fs.readFileSync("./src/scraper/resources/extra.json", "utf8")
);

const scraperConfig = (env, options) => {
return {
target: "node",
Expand Down Expand Up @@ -131,16 +136,9 @@ const scraperConfig = (env, options) => {
options.mode !== "production"
? JSON.stringify(500)
: JSON.stringify(1),
IGNORE_ROOMS: JSON.stringify(
JSON.parse(fs.readFileSync("./src/scraper/resources/ignore.json"))[
"rooms"
]
),
EXTRA_BUILDING_METADATA: JSON.stringify(
JSON.parse(fs.readFileSync("./src/scraper/resources/buildings.json"))[
"buildings"
]
),
IGNORE_ROOMS: JSON.stringify(extraResources["ignore"]),
EXTRA_BUILDING_METADATA: JSON.stringify(extraResources["buildings"]),
EXTRA_CAPACITY_METADATA: JSON.stringify(extraResources["capacities"]),
}),
],
};
Expand Down

0 comments on commit 39dc860

Please sign in to comment.