Merge branch 'main' of github.com:microsoft/CameraTraps into main
agentmorris committed Jul 6, 2022
2 parents 6a865fc + 6925d0e commit 6fa876b
Showing 16 changed files with 1,693 additions and 227 deletions.
56 changes: 39 additions & 17 deletions data_management/lila/get_lila_category_list.py
@@ -1,7 +1,8 @@
#
# get_lila_category_list.py
#
# Example of making a text file listing all category names in specific LILA datasets
# Generates a .json-formatted dictionary mapping each LILA dataset to all categories
# that exist for that dataset, with counts for the number of occurrences of each category.
#

#%% Constants and imports
@@ -26,7 +27,7 @@
use_all_datasets = True

# only need if restrict_category is false
datasets_of_interest = []
datasets_of_interest = []

# We'll write images, metadata downloads, and temporary files here
lila_local_base = r'g:\temp\lila'
@@ -37,7 +38,7 @@
metadata_dir = os.path.join(lila_local_base,'metadata')
os.makedirs(metadata_dir,exist_ok=True)

output_file = os.path.join(output_dir,'category_list.txt')
output_file = os.path.join(output_dir,'lila_dataset_to_categories.json')


#%% Support functions
@@ -102,6 +103,9 @@ def unzip_file(input_file, output_folder=None):
p = urlparse(metadata_url)
metadata_filename = os.path.join(metadata_dir,os.path.basename(p.path))

# Download the metadata file if necessary
download_url(metadata_url,metadata_filename)

# Read lines from the master metadata file
with open(metadata_filename,'r') as f:
metadata_lines = f.readlines()
@@ -124,7 +128,6 @@ def unzip_file(input_file, output_folder=None):

# Create a separate entry for bounding boxes if they exist
if len(tokens[3].strip()) > 0:
print('Adding bounding box dataset for {}'.format(ds_name))
bbox_url_mapping = {'sas_url':tokens[1],'json_url':tokens[3]}
metadata_table[tokens[0]+'_bbox'] = bbox_url_mapping
assert 'https' in bbox_url_mapping['json_url']
@@ -133,10 +136,15 @@ def unzip_file(input_file, output_folder=None):
assert 'https' in url_mapping['sas_url']
assert 'https' in url_mapping['json_url']

print('Read {} entries from the metadata file (including bboxes)'.format(len(metadata_table)))


#%% Download and extract metadata for the datasets we're interested in

if use_all_datasets: datasets_of_interest = list(metadata_table.keys())
if use_all_datasets:

datasets_of_interest = list(metadata_table.keys())

for ds_name in datasets_of_interest:

assert ds_name in metadata_table
@@ -164,8 +172,14 @@ def unzip_file(input_file, output_folder=None):
# ...for each dataset of interest


#%% Get category names
#%% Get category names for each dataset

from collections import defaultdict

dataset_to_categories = {}

# ds_name = datasets_of_interest[0]
# ds_name = 'NACTI'
for ds_name in datasets_of_interest:

print('Finding categories in {}'.format(ds_name))
@@ -182,19 +196,27 @@ def unzip_file(input_file, output_folder=None):

# Collect list of categories and mappings to category name
categories = data['categories']
category_ids = [c['id'] for c in categories]

category_id_to_count = defaultdict(int)
annotations = data['annotations']

# ann = annotations[0]
for ann in annotations:
category_id_to_count[ann['category_id']] = category_id_to_count[ann['category_id']] + 1

# c = categories[0]
for c in categories:
c['name'] = c['name'].lower()
category_id_to_name = {c['id']:c['name'] for c in categories}
count = category_id_to_count[c['id']]
if 'count' in c:
assert 'bbox' in ds_name or c['count'] == count
c['count'] = count

# Append category to categories_list
for category_id in category_ids:
category_name = category_id_to_name[category_id]
if category_name not in category_list: category_list.append(category_name)
dataset_to_categories[ds_name] = categories

# ...for each dataset


#%% Save category names to file
#%% Save dict

with open(output_file, 'w') as txt_file:
for line in category_list:
txt_file.write(line + '\n')
with open(output_file, 'w') as f:
json.dump(dataset_to_categories,f,indent=2)
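For context, here is a minimal sketch of consuming the lila_dataset_to_categories.json file that the updated script writes; the structure (each dataset name mapped to its COCO-style category list, with a `count` field added per category) follows the code above, and the file path and summary printed here are just illustrative.

```python
import json

# Path is wherever the script above wrote its output
with open('lila_dataset_to_categories.json', 'r') as f:
    dataset_to_categories = json.load(f)

# Each value is that dataset's category list, e.g.
# [{"id": 1, "name": "deer", "count": 1234}, ...]
for ds_name, categories in dataset_to_categories.items():
    n_annotations = sum(c['count'] for c in categories)
    print('{}: {} categories, {} annotations'.format(
        ds_name, len(categories), n_annotations))
```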
13 changes: 6 additions & 7 deletions detection/process_video.py
@@ -2,7 +2,7 @@
#
# process_video.py
#
# Split a video into frames, run the frames through run_tf_detector_batch.py, and
# Split a video into frames, run the frames through run_detector_batch.py, and
# optionally stitch together results into a new video with detection boxes.
#
######
@@ -15,7 +15,7 @@
import argparse
import itertools

from detection import run_tf_detector_batch
from detection import run_detector_batch
from visualization import visualize_detector_output
from ct_utils import args_to_object
from detection.video_utils import video_to_frames
@@ -80,16 +80,15 @@ def process_video(options):
print('Loading results from {}'.format(options.output_json_file))
results = None
else:
results = run_tf_detector_batch.load_and_run_detector_batch(
results = run_detector_batch.load_and_run_detector_batch(
options.model_file, image_file_names,
confidence_threshold=options.json_confidence_threshold,
n_cores=options.n_cores)

run_tf_detector_batch.write_results_to_file(
run_detector_batch.write_results_to_file(
results, options.output_json_file,
relative_path_base=frame_output_folder)


if options.render_output_video:

# Render detections to images
@@ -172,12 +171,12 @@ def process_video_folder(options):
print('Loading results from {}'.format(frames_json))
results = None
else:
results = run_tf_detector_batch.load_and_run_detector_batch(
results = run_detector_batch.load_and_run_detector_batch(
options.model_file, image_file_names,
confidence_threshold=options.json_confidence_threshold,
n_cores=options.n_cores)

run_tf_detector_batch.write_results_to_file(
run_detector_batch.write_results_to_file(
results, frames_json,
relative_path_base=frame_output_folder)

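For reference, a minimal sketch of calling the renamed module directly, mirroring the calls process_video.py makes above; the model path, frame filenames, and confidence threshold are placeholders rather than values from this commit.

```python
from detection import run_detector_batch

# Hypothetical inputs; in process_video.py these come from the extracted frames
model_file = 'md_v5a.0.0.pt'
image_file_names = ['frames/frame000000.jpg', 'frames/frame000001.jpg']

results = run_detector_batch.load_and_run_detector_batch(
    model_file, image_file_names,
    confidence_threshold=0.1,
    n_cores=1)

# Write a detector results .json with paths relative to the frame folder
run_detector_batch.write_results_to_file(
    results, 'frames/detections.json',
    relative_path_base='frames')
```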
4 changes: 1 addition & 3 deletions detection/video_utils.py
@@ -339,9 +339,7 @@ def frame_results_to_video_results(input_file,output_file,options:FrameToVideoOp
with open(output_file,'w') as f:
f.write(s)

#%%



#%% Test driver

if False:
28 changes: 14 additions & 14 deletions taxonomy_mapping/README.md
@@ -1,25 +1,25 @@
We generated a list of all the annotations in our universe; the scripts in this folder were used to (interactively) map them onto the GBIF and iNat taxonomies.
## Mapping labels to a standard taxonomy (usually for new LILA datasets)

When a new .json file comes in and needs to be mapped to scientific names...

## Creating the Taxonomy CSV
* Assuming this is a LILA dataset, edit the [LILA metadata file](http://lila.science/wp-content/uploads/2020/03/lila_sas_urls.txt) to include the new .json and dataset name.

Creating the taxonomy CSV file requires running 3 scripts.
* Assuming this is a LILA dataset, use get_lila_category_list.py to download the .json files for every LILA dataset. This will produce a .json-formatted dictionary mapping each dataset to all of the categories it contains.

1. Generate a spreadsheet of the class names within each desired dataset by querying MegaDB. These class names are the names provided directly by our partner organizations and may include abbreviations, e.g., "wtd" meaning "white-tailed deer."
* Use map_new_lila_datasets.py to create a .csv file mapping each of those categories to a scientific name and taxonomy. This will eventually become a subset of rows in the "master" .csv file. This is a semi-automated process; it will look up common names against the iNat and GBIF taxonomies, with some heuristics to avoid simple problems (like making sure that "greater_kudu" matches "greater kudu", or that "black backed jackal" matches "black-backed jackal"), but you will need to fill in a few gaps manually. I do this with three windows open: a .csv editor, Spyder (with the cell called "manual lookup" from this script open), and a browser. Once you generate this .csv file, it's considered permanent, i.e., the cell that wrote it won't re-write it, so manually edit to your heart's content.

This is done by running the `taxonomy_mapping/species_by_dataset.py` script. The first time running this step may take a while. However, intermediary outputs are cached in JSON files for much faster future runs.
* Use preview_lila_taxonomy.py to produce an HTML file full of images that you can use to make sure that the matches were sensible; be particularly suspicious of anything that doesn't look like a mammal, bird, or reptile. Go back and fix things in the .csv file. This script/notebook also does a bunch of other consistency checking.

2. Because each partner organization uses their own naming scheme, we need to map the class names onto a common taxonomy. We use a combination of the [iNaturalist taxonomy](https://forum.inaturalist.org/t/how-to-download-taxa/3542) and the [Global Biodiversity Information Facility (GBIF) Backbone Taxonomy](https://www.gbif.org/dataset/d7dddbf4-2cf0-4f39-9b2a-bb099caae36c).
* When you are totally satisfied with that .csv file, manually append it to the "master" .csv file (lila-taxonomy-mapping.csv), which is currently in a private repository. preview_lila_taxonomy can also be run against the master file.

This is done by running the `taxonomy_mapping/process_species_by_dataset.py` script. Note that this script is not meant to be run as a normal Python script but is instead intended to be run interactively.
* Check for errors one more time (this should be redundant with the checks now included in preview_lila_taxonomy.py, but it can't hurt) by running:

3. Once the taxonomy CSV is generated, check for errors by running
```bash
python taxonomy_mapping/taxonomy_csv_checker.py /path/to/taxonomy.csv
```

```bash
python taxonomy_mapping/taxonomy_csv_checker.py /path/to/taxonomy.csv
```
* Prepare the "release" taxonomy file (which removes a couple columns and removes unused rows) using prepare_lila_taxonomy_release.py .

* Use map_lila_categories.py to get a mapping of every LILA dataset to the common taxonomy.

## Visualize the Taxonomy Hierarchy

The `visualize_taxonomy.ipynb` notebook demonstrates how to visualize the taxonomy hierarchy. It requires the *networkx* and *graphviz* Python packages.
* The `visualize_taxonomy.ipynb` notebook demonstrates how to visualize the taxonomy hierarchy. It requires the *networkx* and *graphviz* Python packages.
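The name-normalization heuristics mentioned in the map_new_lila_datasets.py step above (e.g. matching "greater_kudu" to "greater kudu", or "black backed jackal" to "black-backed jackal") might look roughly like the following sketch; this is illustrative only, not the actual implementation.

```python
def canonical_name(name):
    # Lowercase, treat underscores and hyphens as spaces, and collapse
    # whitespace, so that "greater_kudu" == "greater kudu" and
    # "black backed jackal" == "black-backed jackal"
    name = name.strip().lower().replace('_', ' ').replace('-', ' ')
    return ' '.join(name.split())

# Hypothetical common names from the taxonomy
taxonomy_common_names = ['greater kudu', 'black-backed jackal']
lookup = {canonical_name(n): n for n in taxonomy_common_names}

for query in ['greater_kudu', 'black backed jackal']:
    print('{} -> {}'.format(query, lookup.get(canonical_name(query))))
```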
69 changes: 69 additions & 0 deletions taxonomy_mapping/map_lila_categories.py
@@ -0,0 +1,69 @@
#
# Using the taxonomy .csv file, map all LILA datasets to the standard taxonomy
#

#%% Constants and imports

import json
import os

import pandas as pd

# Created by get_lila_category_list.py... contains counts for each category
lila_dataset_to_categories_file = r"G:\temp\lila\lila_categories_list\lila_dataset_to_categories.json"
lila_taxonomy_file = r"G:\temp\lila\lila-taxonomy-mapping_release.22.07.03.1608.csv"

assert os.path.isfile(lila_dataset_to_categories_file)
assert os.path.isfile(lila_taxonomy_file)


#%% Load category and taxonomy files

with open(lila_dataset_to_categories_file,'r') as f:
lila_dataset_to_categories = json.load(f)

taxonomy_df = pd.read_csv(lila_taxonomy_file)


#%% Map dataset names and category names to scientific names

ds_query_to_scientific_name = {}

unmapped_queries = set()

# i_row = 1; row = taxonomy_df.iloc[i_row]; row
for i_row,row in taxonomy_df.iterrows():

ds_query = row['dataset_name'] + ':' + row['query']
ds_query = ds_query.lower()

if not isinstance(row['scientific_name'],str):
unmapped_queries.add(ds_query)
ds_query_to_scientific_name[ds_query] = 'unmapped'
continue

ds_query_to_scientific_name[ds_query] = row['scientific_name']


#%% For each dataset, make sure we can map every category to the taxonomy

# dataset_name = list(lila_dataset_to_categories.keys())[0]
for _dataset_name in lila_dataset_to_categories.keys():

if '_bbox' in _dataset_name:
dataset_name = _dataset_name.replace('_bbox','')
else:
dataset_name = _dataset_name

categories = lila_dataset_to_categories[dataset_name]

# c = categories[0]
for c in categories:
ds_query = dataset_name + ':' + c['name']
ds_query = ds_query.lower()

if ds_query not in ds_query_to_scientific_name:
print('Could not find mapping for {}'.format(ds_query))
else:
scientific_name = ds_query_to_scientific_name[ds_query]
