forked from microsoft/CameraTraps
Merge branch 'main' of github.com:microsoft/CameraTraps into main
Showing 16 changed files with 1,693 additions and 227 deletions.
## Mapping labels to a standard taxonomy (usually for new LILA datasets)

When a new .json file comes in and needs to be mapped to scientific names...
* Assuming this is a LILA dataset, edit the [LILA metadata file](http://lila.science/wp-content/uploads/2020/03/lila_sas_urls.txt) to include the new .json and dataset name.
* Assuming this is a LILA dataset, use get_lila_category_list.py to download the .json files for every LILA dataset. This will produce a .json-formatted dictionary mapping each dataset to all of the categories it contains.
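The resulting .json file is a dictionary mapping each dataset name to its list of categories. A minimal sketch of the expected shape (the category entries and the `count` field are illustrative assumptions, based on the "contains counts for each category" comment in the mapping script included below):

```python
import json

# Hypothetical excerpt of lila_dataset_to_categories.json; names and counts
# are invented for illustration
lila_dataset_to_categories = {
    "Caltech Camera Traps": [
        {"name": "deer", "count": 100},
        {"name": "empty", "count": 250},
    ]
}

# Round-trip through JSON, as the downstream scripts do when loading the file
serialized = json.dumps(lila_dataset_to_categories)
reloaded = json.loads(serialized)

for dataset_name, categories in reloaded.items():
    print(dataset_name, [c["name"] for c in categories])
```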
* Use map_new_lila_datasets.py to create a .csv file mapping each of those categories to a scientific name and taxonomy. This will eventually become a subset of rows in the "master" .csv file. This is a semi-automated process; it will look up common names against the iNat and GBIF taxonomies, with some heuristics to avoid simple problems (like making sure that "greater_kudu" matches "greater kudu", or that "black backed jackal" matches "black-backed jackal"), but you will need to fill in a few gaps manually. I do this with three windows open: a .csv editor, Spyder (with the cell called "manual lookup" from this script open), and a browser. Once you generate this .csv file, it's considered permanent, i.e., the cell that wrote it won't re-write it, so manually edit to your heart's content.
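The kind of normalization heuristic described here can be sketched as a small function (a simplified illustration of the idea, not the script's actual code):

```python
def normalize_category_name(name):
    # Collapse case, underscores, hyphens, and extra whitespace so that
    # variants like 'greater_kudu' / 'greater kudu' or 'black backed jackal' /
    # 'black-backed jackal' compare equal before the taxonomy lookup.
    name = name.lower().replace('_', ' ').replace('-', ' ')
    return ' '.join(name.split())

assert normalize_category_name('greater_kudu') == normalize_category_name('greater kudu')
assert normalize_category_name('black backed jackal') == normalize_category_name('black-backed jackal')
```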
* Use preview_lila_taxonomy.py to produce an HTML file full of images that you can use to make sure that the matches were sensible; be particularly suspicious of anything that doesn't look like a mammal, bird, or reptile. Go back and fix things in the .csv file. This script/notebook also does a bunch of other consistency checking.
* When you are totally satisfied with that .csv file, manually append it to the "master" .csv file (lila-taxonomy-mapping.csv), which is currently in a private repository. preview_lila_taxonomy.py can also be run against the master file.
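The append itself is a manual copy-and-paste, but conceptually it's just a row-wise concatenation. A sketch with pandas (the column names match the mapping script included below; the rows themselves are invented for illustration):

```python
import pandas as pd

# Invented rows standing in for the master file and the new dataset's file
master_df = pd.DataFrame([
    {'dataset_name': 'some-existing-dataset', 'query': 'deer',
     'scientific_name': 'odocoileus virginianus'},
])
new_df = pd.DataFrame([
    {'dataset_name': 'some-new-dataset', 'query': 'wtd',
     'scientific_name': 'odocoileus virginianus'},
])

combined_df = pd.concat([master_df, new_df], ignore_index=True)
# combined_df.to_csv('lila-taxonomy-mapping.csv', index=False)  # the private "master" file
print(len(combined_df))
```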
* Check for errors one more time (this should be redundant with what's now included in preview_lila_taxonomy.py, but it can't hurt) by running:
```bash
python taxonomy_mapping/taxonomy_csv_checker.py /path/to/taxonomy.csv
```
* Prepare the "release" taxonomy file (which removes a couple of columns and drops unused rows) using prepare_lila_taxonomy_release.py.
* Use map_lila_categories.py to get a mapping of every LILA dataset to the common taxonomy.
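The key format used throughout these scripts is a lowercased `dataset_name:category_name` string; this matches the mapping script included below. A small sketch of the lookup (the example dictionary entry is invented; real entries come from the release taxonomy .csv):

```python
# Invented example entry; real keys are built from the taxonomy .csv file
ds_query_to_scientific_name = {
    'caltech camera traps:deer': 'odocoileus hemionus',
}

def lookup_scientific_name(dataset_name, category_name):
    # Mirror the key construction used in the mapping script below
    ds_query = (dataset_name + ':' + category_name).lower()
    return ds_query_to_scientific_name.get(ds_query, 'unmapped')

print(lookup_scientific_name('Caltech Camera Traps', 'deer'))
print(lookup_scientific_name('Caltech Camera Traps', 'car'))
```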
## Visualize the Taxonomy Hierarchy
* The `visualize_taxonomy.ipynb` notebook demonstrates how to visualize the taxonomy hierarchy. It requires the *networkx* and *graphviz* Python packages.
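For readers without the notebook handy: the underlying idea is just a parent-to-child graph over taxa. A dependency-free sketch of the same structure (the notebook itself uses *networkx* and *graphviz* for proper rendering; the taxa below are a tiny illustrative subset):

```python
# Parent -> child relationships for a few taxa, as (parent, child) pairs
edges = [
    ('mammalia', 'cetartiodactyla'),
    ('mammalia', 'carnivora'),
    ('cetartiodactyla', 'cervidae'),
    ('cervidae', 'odocoileus virginianus'),
]

children = {}
for parent, child in edges:
    children.setdefault(parent, []).append(child)

def print_tree(node, depth=0):
    # Depth-first traversal, printed with two-space indentation per level
    print('  ' * depth + node)
    for c in children.get(node, []):
        print_tree(c, depth + 1)

print_tree('mammalia')
```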
```python
#
# Using the taxonomy .csv file, map all LILA datasets to the standard taxonomy
#

#%% Constants and imports

import json
import os

import pandas as pd

# Created by get_lila_category_list.py... contains counts for each category
lila_dataset_to_categories_file = r"G:\temp\lila\lila_categories_list\lila_dataset_to_categories.json"
lila_taxonomy_file = r"G:\temp\lila\lila-taxonomy-mapping_release.22.07.03.1608.csv"

assert os.path.isfile(lila_dataset_to_categories_file)
assert os.path.isfile(lila_taxonomy_file)


#%% Load category and taxonomy files

with open(lila_dataset_to_categories_file, 'r') as f:
    lila_dataset_to_categories = json.load(f)

taxonomy_df = pd.read_csv(lila_taxonomy_file)


#%% Map dataset names and category names to scientific names

ds_query_to_scientific_name = {}

unmapped_queries = set()

# i_row = 1; row = taxonomy_df.iloc[i_row]; row
for i_row, row in taxonomy_df.iterrows():

    ds_query = row['dataset_name'] + ':' + row['query']
    ds_query = ds_query.lower()

    if not isinstance(row['scientific_name'], str):
        unmapped_queries.add(ds_query)
        ds_query_to_scientific_name[ds_query] = 'unmapped'
        continue

    ds_query_to_scientific_name[ds_query] = row['scientific_name']


#%% For each dataset, make sure we can map every category to the taxonomy

# dataset_name = list(lila_dataset_to_categories.keys())[0]
for _dataset_name in lila_dataset_to_categories.keys():

    # Bounding-box versions of a dataset share categories with the base dataset
    if '_bbox' in _dataset_name:
        dataset_name = _dataset_name.replace('_bbox', '')
    else:
        dataset_name = _dataset_name

    categories = lila_dataset_to_categories[dataset_name]

    # c = categories[0]
    for c in categories:
        ds_query = dataset_name + ':' + c['name']
        ds_query = ds_query.lower()

        if ds_query not in ds_query_to_scientific_name:
            print('Could not find mapping for {}'.format(ds_query))
        else:
            scientific_name = ds_query_to_scientific_name[ds_query]
```