Skip to content

Commit

Permalink
Add build scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
nntrn committed Jun 25, 2024
1 parent a43598e commit c457bd7
Show file tree
Hide file tree
Showing 3 changed files with 185 additions and 0 deletions.
2 changes: 2 additions & 0 deletions scripts/markdown-toc.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#!/usr/bin/env bash
# Emit one Markdown bullet per ./catalog/*.md file, linking the file's base
# name to its path. jq reads the listing one raw line at a time; blank lines
# are ignored.
ls -1 ./catalog/*.md |
  jq -Rr 'select(length >0)|"* [\(.|split("/")|last)](\(.))"'
62 changes: 62 additions & 0 deletions scripts/update-catalogs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#!/usr/bin/env bash
#
# Configuration for the catalog refresh script.
#
# Positional arguments (each is overridden by the environment variable of the
# same name when that variable is set and non-empty):
#   $1  OUTDIR  where downloaded catalog JSON is stored (default: new mktemp dir)
#   $2  DOCDIR  where generated Markdown is written     (default: .)

set -e

# "$0" is quoted so a script path containing whitespace is not word-split.
SCRIPT="$(realpath "$0")"
# Drop the last two path components (the script file and its directory).
DIR=${SCRIPT%/*/*}
OUTDIR=${OUTDIR:-$1}

DOCDIR=${DOCDIR:-$2}
CATALOG_URL='https://api.us.socrata.com/api/catalog/v1?only=dataset&limit=10000'
# NOTE(review): FORCE is assigned here but not read anywhere in the visible script.
FORCE=${FORCE:-0}

_pids=()

if [[ -z $OUTDIR ]]; then
  OUTDIR=$(mktemp -d)
fi

if [[ -z $DOCDIR ]]; then
  DOCDIR=.
fi

# Quoted, and terminated with "--", so directories with spaces or a leading
# dash are created as given.
mkdir -p -- "$OUTDIR"
mkdir -p -- "$DOCDIR"

# download_data NAME URL [GROUPBY]
#
# Fetch URL into "$OUTDIR/NAME.json" unless that file already exists, then
# render "$DOCDIR/NAME.md" from it with the jq module in "$DIR/scripts".
# GROUPBY selects the grouping key passed to write_markdown: "domain" groups by
# domain, anything else groups by category.
# Prints the Markdown path on stdout. Returns non-zero when the download or the
# jq rendering fails.
download_data() {
  local name=$1 url=$2
  local data_out="$OUTDIR/${name}.json"
  local doc_out="$DOCDIR/${name}.md"

  if [[ ! -f $data_out ]]; then
    # Download to a side file first: an interrupted transfer must not leave a
    # truncated JSON behind that the existence check above would then accept.
    local partial="${data_out}.part"
    if ! curl -s --create-dirs -o "$partial" "$url" --fail; then
      rm -f -- "$partial"
      echo "download failed: $url" >&2
      return 1
    fi
    mv -- "$partial" "$data_out"
  fi

  echo "$doc_out"

  local groupby=category
  if [[ $3 == "domain" ]]; then
    groupby=domain
  fi

  jq -L "$DIR/scripts" -r "include \"views\"; results|write_markdown(\"${groupby}\")" "$data_out" >"$doc_out"
}

# PIDs of the background jobs. They are recorded here, in the parent shell:
# "$!" is only meaningful right after a command started with "&", and an array
# modified inside a backgrounded function lives in that job's subshell only.
_pids=()

# Run download_data with the given arguments as a background job and remember
# its PID.
_spawn_download() {
  download_data "$@" &
  _pids+=("$!")
}

_spawn_download texas-gov "${CATALOG_URL}&domains=data.texas.gov" category
_spawn_download texas "${CATALOG_URL}&q=texas" domain
_spawn_download austin "${CATALOG_URL}&domains=datahub.austintexas.gov" category
_spawn_download crime "${CATALOG_URL}&q=crime" domain
_spawn_download datasets "${CATALOG_URL}&q=datasets" domain
_spawn_download shootings "${CATALOG_URL}&q=shooting" domain
_spawn_download police "${CATALOG_URL}&q=police" domain
_spawn_download salaries "${CATALOG_URL}&q=salaries" domain
_spawn_download jobs "${CATALOG_URL}&q=jobs" domain
_spawn_download survey "${CATALOG_URL}&q=survey" domain
_spawn_download public-safety "${CATALOG_URL}&categories=public%20safety" category

# Wait for every job individually so that one failure neither hides the others
# nor aborts the script (set -e) while jobs are still running.
status=0
for pid in "${_pids[@]}"; do
  wait "$pid" || status=1
done

exit "$status"
121 changes: 121 additions & 0 deletions scripts/views.jq
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@


# Reduce an HTML-ish description string to plain ASCII text.
# Steps run in order: decode &lt; / &gt;, drop tags, drop asterisks, drop
# non-ASCII characters, turn "newline + tab" into a dash item, break after a
# period followed by 2+ whitespace characters, collapse remaining tab runs.
def clean_html:
  # Delete every match of $re.
  def strip($re): gsub($re; "");
  gsub("&lt;";"<")
  | gsub("&gt;";">")
  | strip("<[^>]*>")
  | strip("\\*")
  | gsub("[[:^ascii:]]";"";"x")
  | gsub("\n[\\t]";"\n - ")
  | gsub("\\.[\\s]{2,}";".\n")
  | gsub("[\\t]+";" ");

# Pass through only views whose assetType is "dataset", with the
# tableAuthor, approvals, grants and owner keys removed.
def clean_view:
  select(.assetType == "dataset")
  | delpaths([["tableAuthor"], ["approvals"], ["grants"], ["owner"]]);

# Flatten a nested document into a single-level object.
# Each scalar leaf selected by paths(scalars) becomes one entry:
#   key   - path components joined with "_", "_<digits>" segments removed,
#           spaces replaced by "_", lower-cased;
#   value - the leaf itself, or the whole enclosing array when the last path
#           component is an array index.
# Entries are de-duplicated with `unique` before being merged.
def deep_flatten:
  def flat_key: join("_") | gsub("_[0-9]+";"") | gsub(" ";"_") | ascii_downcase;
  [ paths(scalars) as $p
    | ($p | last | type == "number") as $is_index
    | {
        key: ($p | flat_key),
        value: (if $is_index then getpath($p[0:-1]) else getpath($p) end)
      }
  ]
  | unique
  | from_entries;

# Convert the input to a string, clean it with clean_html, and return the
# lines that contain an uppercase letter immediately followed by a lowercase one.
def extract_desc:
  tostring
  | clean_html
  | [ split("\n")[] | select(test("[A-Z][a-z]")) ];

# Project a flattened dataset record onto the fields used by the Markdown
# writer.
#
# Output keys:
#   name         copied from the input
#   url          JSON resource URL built from .domaincname and .id
#   meta_url     view metadata URL built from .domaincname and .id
#   category     first non-null/non-false of the listed category fields
#   domain       .domaincname
#   description  cleaned description lines (see extract_desc)
#   summary      longest description line shorter than 400 characters that
#                starts with an uppercase letter followed by a lowercase one;
#                null when no line qualifies
#   last_update  .rowsupdatedat formatted as "%b %Y" in local time, or null
#                when it cannot be formatted
def neat_view:
  (.description|extract_desc) as $desc
  | {
      name,
      url: "https://\(.domaincname)/resource/\(.id).json",
      meta_url: "https://\(.domaincname)/api/views/\(.id)",
      category: (
        .metadata_custom_fields_dataset_category_category_tile //
        .category //
        .metadata_custom_fields_ownership_department_name //
        .metadata_custom_fields_microsite_tags
      ),
      domain: .domaincname,
      description: $desc,
      summary: ($desc|map(select(length < 400 and test("^[A-Z][a-z]")))|max_by(length)),
      # `strflocaltime(...)?` would produce `empty` on error, and an object
      # constructor with an empty value emits no object at all, silently
      # dropping the record. Fall back to null so the record is kept.
      last_update: (.rowsupdatedat | (try strflocaltime("%b %Y") catch null))
    };

# Build an anchor slug from $text: stringify and lower-case it, delete
# periods, then replace every run of one or two non-alphanumeric characters
# with a single "-".
def slugify($text):
  ($text | tostring | ascii_downcase) as $lowered
  | $lowered
  | gsub("\\.";"")
  | gsub("[^a-zA-Z0-9]{1,2}";"-");

# Light typographic clean-up of a title string. Steps run in order:
#   1. remove the space that follows a double quote, "(" or "/";
#   2. insert a space after a double quote squeezed between a lowercase
#      letter and another letter;
#   3. insert a space after a period squeezed between a lowercase and an
#      uppercase letter;
#   4. collapse runs of whitespace/underscores into one space;
#   5. trim leading and trailing whitespace;
#   6. remove "[" together with the non-"]" characters that follow it.
def simple_clean_text:
  gsub("(?<b>[\"\\(\\/]) "; .b)
  | gsub("(?<a>[a-z])\"(?<b>[a-zA-Z])"; "\(.a)\" \(.b)")
  | gsub("(?<a>[a-z])\\.(?<b>[A-Z])"; "\(.a). \(.b)"; "x")
  | gsub("[\\s_]+"; " ")
  | gsub("^[\\s]+|[\\s]+$"; "")
  | gsub("\\[[^\\]]+"; "");

# Upper-case the first letter of every match of "a letter followed by one or
# more word/digit/whitespace characters"; the rest of the match is unchanged.
def capitalize:
  gsub("((?<c>[a-zA-Z])(?<a>[\\w\\d\\s]+))"; "\(.c | ascii_upcase)\(.a)"; "x");

# Derive a title from a file path: take the last "/"-separated component,
# keep the part before its first ".", and capitalize it.
def title_from_filename:
  (split("/") | .[-1]) as $basename
  | $basename
  | split(".")
  | .[0]
  | capitalize;

# Normalise a dataset name (input string):
#   - drop one leading non-alphanumeric character;
#   - replace a leading "Strategic Measure(s)" prefix with "(%)" and a leading
#     "Strategic Direction" with "($)";
#   - remove parenthesised text of 30+ characters and any square brackets;
#   - finish with simple_clean_text.
def format_title:
  def drop_leading_symbol: gsub("^[^a-zA-Z0-9]";"");
  def mark_strategic:
    gsub("^Strategic[_\\s]Measure[s\\s_-]+";"(%)";"x")
    | gsub("^Strategic Direction";"($)");
  def drop_long_parens_and_brackets: gsub("\\([^\\)]{30,}\\)|[\\[\\]]";"");
  drop_leading_symbol
  | mark_strategic
  | drop_long_parens_and_brackets
  | simple_clean_text;

# Argument form: format $str instead of the current input.
def format_title($str):
  $str
  | format_title;

# Render an array of dataset records as one Markdown document.
#
# $groupby is the name of the record key used to group datasets into
# sections. The document title comes from the current input file name.
# Records are kept only when .category is non-empty and .name passes the
# name filter below. Output: "# title", a collapsible table of contents with
# one link per group, then one "## group" section listing its datasets
# sorted by name.
def write_markdown($groupby):
  # True for names that contain an uppercase letter followed by a lowercase
  # one and none of the demo/test/archive/utilities markers.
  def presentable_name:
    test("[A-Z][a-z]";"x")
    and (test("DEMO|[Dd]emo|TEST|[Tt]est|ARCHIVE|[Aa]rchive|UTILITIES") | not);

  # One bullet for a dataset; empty or null parts are left out.
  def bullet:
    [
      "- **\(.name)**",
      "[Data](\(.url)) | [Meta](\(.meta_url)) | Last update: \(.last_update)",
      .summary
    ]
    | [ .[] | select(length > 0) ]
    | join(" \n ");

  # Table-of-contents line for one group (input: the group's records).
  def toc_line: "- [\(.[0][$groupby])](#\(slugify(.[0][$groupby])))";

  # Heading plus bullets for one group (input: the group's records).
  def section:
    "\n## \(.[0][$groupby])\n\n"
    + ([ sort_by(.name)[] | bullet ] | join("\n\n"));

  (input_filename | title_from_filename) as $filename
  | [ .[] | select((.category | length) > 0) ]
  | [ .[] | select(.name | presentable_name) ]
  | group_by(.["\($groupby)"])
  | (
      (
        [
          "<details id=\"table-of-contents\"><summary><strong>Table of Contents</strong></summary>",
          ""
        ]
        + map(toc_line)
        + [
          "",
          "</details></br>",
          "",
          "> **NOTE** ",
          "> (%) denotes strategic dataset",
          ""
        ]
      )
      | join("\n")
    ) as $toc
  | map(section)
  | flatten
  | join("\n\n")
  | "# \($filename)\n\n\($toc)\n\n\(.)";

# Zero-argument form: render the document grouped by the "category" key.
# write_markdown/1 takes the *name* of the grouping key and uses it to index
# each record. Passing `.category` would instead evaluate against the input
# array of records and fail, so the key name is passed as a string.
def write_markdown: write_markdown("category");

# Run an array of views through three passes, each over the whole array:
# clean_view (which also filters), then deep_flatten, then neat_view.
def view:
  [ .[] | clean_view ]
  | [ .[] | deep_flatten ]
  | [ .[] | neat_view ];

# Convert a catalog response (an object with a .results array) into the
# records produced by neat_view.
#
# For every result the following are merged, later keys winning:
#   1. .resource, minus page_views and the per-column arrays;
#   2. .classification.domain_metadata turned into an object whose keys are
#      lower-cased with "-" replaced by "_";
#   3. computed fields: name, domain_category, category, domaincname,
#      rowsupdatedat (epoch seconds parsed from .resource.updatedAt with any
#      fractional-seconds suffix removed).
def results:
  .results
  | map(
      # Normalised domain metadata. A missing/null list is treated as empty
      # so that from_entries does not fail on it.
      (
        (.classification.domain_metadata // [])
        | from_entries
        | with_entries(.key |= (ascii_downcase | gsub("-";"_")))
      ) as $meta
      | (.resource | del(.page_views,.columns_format,.columns_description,.columns_datatype,.columns_field_name,.columns_name))
        + $meta
        + {
            name: format_title(.resource.name),
            domain_category: .classification.domain_category,
            # The fallback keys exist only in the normalised metadata, not on
            # the raw result item, so they are looked up in $meta.
            category: (
              .classification.domain_category //
              $meta.strategic_area_strategic_direction_outcome //
              $meta.ownership_department_name
            ),
            domaincname: .metadata.domain,
            rowsupdatedat: (.resource.updatedAt | gsub("\\..*";"") | strptime("%Y-%m-%dT%H:%M:%S") | mktime)
          }
    )
  | map(neat_view);

0 comments on commit c457bd7

Please sign in to comment.