Skip to content

Commit

Permalink
upload and download scopes via huggingface
Browse files Browse the repository at this point in the history
  • Loading branch information
enjalot committed Nov 17, 2024
1 parent 53704a0 commit 75f26c8
Show file tree
Hide file tree
Showing 20 changed files with 825 additions and 339 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@ testenv-prod
.env
dist
build
*.egg-info
*.egg-info
*.un~
99 changes: 99 additions & 0 deletions latentscope/scripts/download_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
"""
Usage:
python latentscope/scripts/download_dataset.py "enjalot/ls-datavis-misunderstood" ~/latent-scope-data/datavis-misunderstood
"""

import argparse
import json
import os
import shutil
from pathlib import Path

from datasets import load_dataset
from huggingface_hub import hf_hub_download, snapshot_download

from latentscope.util import get_key

def download_from_huggingface(dataset_repo, dataset_name, output_dir, token=None):
    """
    Download a latentscope dataset from Hugging Face and install it locally.

    Args:
        dataset_repo (str): Repo id on Hugging Face (e.g., 'username/dataset-name')
        dataset_name (str): Local name for the dataset; becomes the directory
            name under output_dir and the 'id' written into meta.json
        output_dir (str): Local directory to save the downloaded files
        token (str, optional): Hugging Face API token; falls back to the
            HUGGINGFACE_TOKEN key from .env when not provided

    Raises:
        Exception: any error from the download or post-processing is printed
            and re-raised for the caller.
    """
    # Get token from .env if not provided
    if not token:
        token = get_key("HUGGINGFACE_TOKEN")

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    try:
        # Download the latentscope directory into output_dir/dataset_name
        latentscope_path = output_path / dataset_name
        if latentscope_path.exists():
            # Proceed anyway, but warn: existing files may be overwritten.
            print(f"Warning: directory {latentscope_path} already exists")
        latentscope_path.mkdir(parents=True, exist_ok=True)

        # Download all files from the dataset repo into the target directory.
        snapshot_download(
            repo_id=dataset_repo,
            repo_type="dataset",
            local_dir=str(latentscope_path),
            token=token,
            local_dir_use_symlinks=False
        )

        # Remove repo-only artifacts (raw data dir, README) that are not part
        # of a local latentscope dataset.
        data_path = latentscope_path / "data"
        readme_path = latentscope_path / "README.md"
        if data_path.exists():
            shutil.rmtree(data_path)
        if readme_path.exists():
            os.remove(readme_path)

        # The repo nests everything under a 'latentscope/' directory;
        # flatten it by moving its contents up one level.
        latentscope_dir = latentscope_path / "latentscope"
        if latentscope_dir.exists():
            for item in latentscope_dir.iterdir():
                dest = latentscope_path / item.name
                if dest.exists():
                    # Replace any stale copy so the downloaded version wins.
                    if dest.is_dir():
                        shutil.rmtree(dest)
                    else:
                        os.remove(dest)
                shutil.move(str(item), str(dest))
            # Remove the now empty latentscope directory
            latentscope_dir.rmdir()

        # Rewrite meta.json so the dataset id matches the chosen local name.
        meta_path = latentscope_path / "meta.json"
        if meta_path.exists():
            with open(meta_path, 'r') as f:
                meta = json.load(f)
            meta['id'] = dataset_name
            with open(meta_path, 'w') as f:
                json.dump(meta, f, indent=2)

        print(f"Successfully downloaded latentscope files to: {latentscope_path}")

    except Exception as e:
        print(f"Error downloading scope: {e}")
        raise

def main():
    """CLI entry point: parse arguments and run the download."""
    arg_parser = argparse.ArgumentParser(
        description='Download a latentscope dataset from Hugging Face'
    )
    # (flag, options) pairs keep the argument table easy to scan and extend.
    argument_specs = [
        ('dataset_repo', {'help': 'Path to the dataset on Hugging Face (e.g., username/dataset-name)'}),
        ('dataset_name', {'help': 'Name of the dataset'}),
        ('output_dir', {'help': 'Local directory to save the downloaded files'}),
        ('--token', {'help': 'Hugging Face API token', 'default': None}),
    ]
    for flag, options in argument_specs:
        arg_parser.add_argument(flag, **options)

    parsed = arg_parser.parse_args()
    download_from_huggingface(parsed.dataset_repo, parsed.dataset_name, parsed.output_dir, parsed.token)

if __name__ == "__main__":
    main()
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
"""
python latentscope/scripts/upload_scope.py ~/latent-scope-demo/datavis-misunderstood "ls-datavis-misunderstood" --main-parquet="scopes/scopes-001-input.parquet" --private=False
"""

from datasets import Dataset
from pathlib import Path
from huggingface_hub import login, HfApi
Expand All @@ -7,6 +11,14 @@
import tempfile
from latentscope.util import get_key

def get_human_readable_size(size_in_bytes):
    """Convert a byte count to a human readable string (e.g. '1.5 KB')."""
    units = ['B', 'KB', 'MB', 'GB', 'TB']
    value = size_in_bytes
    idx = 0
    # Scale down by 1024 until the value fits the current unit,
    # spilling into petabytes when even TB is exhausted.
    while idx < len(units) and value >= 1024.0:
        value /= 1024.0
        idx += 1
    unit = units[idx] if idx < len(units) else 'PB'
    return f"{value:.1f} {unit}"

def upload_to_huggingface(directory_path, dataset_name, main_parquet_path=None, token=None, private=True):
"""
Upload a directory with all its files to Hugging Face datasets.
Expand Down Expand Up @@ -67,6 +79,15 @@ def upload_to_huggingface(directory_path, dataset_name, main_parquet_path=None,
data_dir = temp_path / "data"
data_dir.mkdir(exist_ok=True)

# Calculate total size before copying files
total_size = 0
for root, _, filenames in os.walk(directory):
for filename in filenames:
file_path = Path(root) / filename
total_size += file_path.stat().st_size

human_readable_size = get_human_readable_size(total_size)

# Copy all files to the data directory while preserving structure
for root, _, filenames in os.walk(directory):
for filename in filenames:
Expand Down Expand Up @@ -108,6 +129,8 @@ def upload_to_huggingface(directory_path, dataset_name, main_parquet_path=None,
This dataset contains the files necessary to view in [latentscope](https://github.com/enjalot/latent-scope).
The files in the `latentscope` are used by the app to view. You can also preview the scope TODO
Total size of dataset files: {human_readable_size}
TODO: download script inside latentscope
"""
readme_path = temp_path / "README.md"
Expand All @@ -121,8 +144,7 @@ def upload_to_huggingface(directory_path, dataset_name, main_parquet_path=None,
path_in_repo="README.md",
repo_type="dataset",
)
print("README UPLOADED")
print("ALL DONE")
print(f"uploaded to: {username}/{dataset_name}")

def main():
parser = argparse.ArgumentParser(description='Upload a directory with files to Hugging Face datasets')
Expand Down
7 changes: 5 additions & 2 deletions latentscope/server/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,18 +307,21 @@ def update_settings():
@app.route('/api/settings', methods=['GET'])
def get_settings():
    """Return app settings: data dir, configured API keys, supported keys, env path."""
    # Assuming the .env file is in the root directory
    config = dotenv_values(".env")
    supported_api_keys = get_supported_api_keys()
    # Expose only key *names* that we recognize — never their values.
    configured_keys = [name for name in config if name in supported_api_keys]
    settings = {
        "data_dir": config["LATENT_SCOPE_DATA"],
        "api_keys": configured_keys,
        "supported_api_keys": supported_api_keys,
        "env_file": os.path.abspath(".env"),
    }
    return jsonify(settings)

@app.route('/api/version', methods=['GET'])
def get_version():
    """Return the installed latentscope version string.

    Returns:
        str: the package's ``__version__`` as the raw response body.
    """
    # Removed a leftover debug print that logged on every request.
    return __version__


@app.route('/', defaults={'path': ''})
@app.route('/<path:path>')
def catch_all(path):
Expand Down
23 changes: 23 additions & 0 deletions latentscope/server/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ def run_job(dataset, job_id, command):

@jobs_bp.route('/job')
def get_job():
print("get_job", request.args)
dataset = request.args.get('dataset')
job_id = request.args.get('job_id')
progress_file = os.path.join(DATA_DIR, dataset, "jobs", f"{job_id}.json")
Expand Down Expand Up @@ -463,3 +464,25 @@ def run_plot():
threading.Thread(target=run_job, args=(dataset, job_id, command)).start()
return jsonify({"job_id": job_id})

@jobs_write_bp.route('/download_dataset')
def download_dataset():
    """Start a background job that downloads a latentscope dataset from Hugging Face.

    Query params:
        dataset_repo: Hugging Face repo id (e.g. 'username/dataset-name')
        dataset_name: local name to save the dataset under

    Returns:
        JSON {"job_id": <uuid>} identifying the spawned job.
    """
    import shlex  # local import: only needed to shell-escape the args below

    dataset_repo = request.args.get('dataset_repo')
    dataset_name = request.args.get('dataset_name')

    job_id = str(uuid.uuid4())
    # shlex.quote the user-controlled query params so a crafted request cannot
    # inject shell commands. NOTE(review): assumes run_job executes the command
    # through a shell — the original double-quoting implies it; confirm.
    command = (
        "python latentscope/scripts/download_dataset.py "
        f"{shlex.quote(str(dataset_repo))} {shlex.quote(str(dataset_name))} {shlex.quote(str(DATA_DIR))}"
    )
    threading.Thread(target=run_job, args=(dataset_name, job_id, command)).start()
    return jsonify({"job_id": job_id})

@jobs_write_bp.route('/upload_dataset')
def upload_dataset():
    """Start a background job that uploads a local dataset to Hugging Face.

    Query params:
        dataset: local dataset directory name under DATA_DIR
        hf_dataset: target Hugging Face dataset name
        main_parquet: relative path of the parquet file used as the main table
        private: whether the HF dataset should be private (passed through as-is)

    Returns:
        JSON {"job_id": <uuid>} identifying the spawned job.
    """
    import shlex  # local import: only needed to shell-escape the args below

    dataset = request.args.get('dataset')
    hf_dataset = request.args.get('hf_dataset')
    main_parquet = request.args.get('main_parquet')
    private = request.args.get('private')

    job_id = str(uuid.uuid4())
    path = os.path.join(DATA_DIR, dataset)
    # shlex.quote the user-controlled query params so a crafted request cannot
    # inject shell commands. NOTE(review): assumes run_job executes the command
    # through a shell — the original double-quoting implies it; confirm.
    command = (
        "python latentscope/scripts/upload_dataset.py "
        f"{shlex.quote(path)} {shlex.quote(str(hf_dataset))} "
        f"--main-parquet={shlex.quote(str(main_parquet))} --private={shlex.quote(str(private))}"
    )
    threading.Thread(target=run_job, args=(dataset, job_id, command)).start()
    return jsonify({"job_id": job_id})
3 changes: 2 additions & 1 deletion latentscope/util/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@ def get_supported_api_keys():
"VOYAGE_API_KEY",
"TOGETHER_API_KEY",
"COHERE_API_KEY",
"MISTRAL_API_KEY"
"MISTRAL_API_KEY",
"HUGGINGFACE_TOKEN"
]

def set_openai_key(openai_key, env_file=".env"):
Expand Down
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ jupyterlab-widgets~=3.0.9
jupyterlab_pygments~=0.3.0
jupyterlab_server~=2.25.2
kiwisolver~=1.4.5
latentsae~=0.1.0
llvmlite~=0.42.0
MarkupSafe~=2.1.3
matplotlib
Expand All @@ -95,6 +96,7 @@ numba~=0.59.0
openai~=1.12.0
opt-einsum
orjson~=3.9.12
outlines~=0.1.0
overrides~=7.7.0
packaging~=23.2
pandas
Expand Down
15 changes: 4 additions & 11 deletions web/src/components/Explore/ScopeHeader.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,9 @@ import { compareVersions } from 'compare-versions';
import { apiService } from '../../lib/apiService';
import { isMobileDevice } from '../../utils';

const readonly = import.meta.env.MODE == "read_only"
const readonly = import.meta.env.MODE == 'read_only';

function DatasetHeader({
dataset,
scope,
scopes,
onScopeChange,
tags,
deletedIndices
}) {
function DatasetHeader({ dataset, scope, scopes, onScopeChange, tags, deletedIndices }) {
if (!dataset) return null;

const [lsVersion, setLsVersion] = useState(null);
Expand Down Expand Up @@ -137,7 +130,7 @@ DatasetHeader.propTypes = {
scopes: PropTypes.array.isRequired,
onScopeChange: PropTypes.func.isRequired,
isMobileDevice: PropTypes.bool,
tags: PropTypes.array.isRequired
tags: PropTypes.array.isRequired,
};

export default DatasetHeader;
export default DatasetHeader;
102 changes: 102 additions & 0 deletions web/src/components/HFDownload.jsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
import { useState, useCallback, useEffect, useMemo } from 'react';
import { Button, Input } from 'react-element-forge';
import ModelSelect from './ModelSelect';
import JobProgress from './Job/Progress';
import { useStartJobPolling } from './Job/Run';
import { apiService, apiUrl } from '../lib/apiService';

import styles from './HFDownload.module.scss';

/**
 * HFDownload — select a latentscope dataset on Hugging Face and download it
 * into the local data directory via a server-side background job.
 *
 * Props:
 *   onComplete (function, optional): called with the chosen local dataset
 *     name once the download job reports status 'completed'.
 */
function HFDownload({ onComplete }) {
  // Currently selected search result ({ name, ... } — shape comes from the
  // search API; only `name` is read here).
  const [selectedDataset, setSelectedDataset] = useState(null);
  // Local name the dataset will be saved under (editable by the user).
  const [datasetName, setDatasetName] = useState('');
  // Search results shown in the select widget.
  const [HFDatasets, setHFDatasets] = useState([]);
  // The in-flight download job, or null when idle.
  const [downloadJob, setDownloadJob] = useState(null);

  // Memoized so the polling hook gets a stable object identity per name.
  const dataset = useMemo(() => {
    return { id: datasetName };
  }, [datasetName]);

  const { startJob: startDownloadJob } = useStartJobPolling(
    dataset,
    setDownloadJob,
    `${apiUrl}/jobs/download_dataset`
  );

  // Search HuggingFace datasets with tag "latent-scope"
  const searchHFDatasets = useCallback((query) => {
    apiService.searchHFDatasets(query).then((datasets) => {
      setHFDatasets(datasets);
    });
  }, []);

  // Load the initial (unfiltered) dataset list on mount.
  useEffect(() => {
    searchHFDatasets('');
  }, [searchHFDatasets]);

  const handleDatasetSelect = useCallback(
    (selected) => {
      setSelectedDataset(selected);
      // Set a default name based on the selected dataset
      setDatasetName(selected.name.split('/').pop());
    },
    [setSelectedDataset, setDatasetName]
  );

  const handleDownload = useCallback(
    (e) => {
      e.preventDefault();
      // Guard: do nothing until both a dataset and a local name are chosen.
      if (!selectedDataset || !datasetName) return;

      startDownloadJob({
        dataset_repo: selectedDataset.name,
        dataset_name: datasetName,
      });
    },
    [selectedDataset, datasetName, startDownloadJob]
  );

  // When job completes, notify parent
  // NOTE(review): assumes the jobs API marks success with
  // status === 'completed' — confirm against the server's job payload.
  useEffect(() => {
    if (downloadJob?.status === 'completed' && onComplete) {
      onComplete(datasetName);
      setDownloadJob(null);
    }
  }, [downloadJob, datasetName, onComplete]);

  // Single option group for the select widget.
  const allOptionsGrouped = [
    {
      label: 'Latent Scope Datasets',
      options: HFDatasets,
    },
  ];

  return (
    <div className={styles.downloader}>
      <div className={styles.selector}>
        <ModelSelect
          options={allOptionsGrouped}
          onChange={handleDatasetSelect}
          onInputChange={searchHFDatasets}
          placeholder="Search Latent Scope datasets..."
        />
      </div>
      {selectedDataset ? (
        <Input
          label="Save as"
          value={datasetName}
          onChange={(e) => setDatasetName(e.target.value)}
        />
      ) : null}
      <Button
        onClick={handleDownload}
        disabled={!selectedDataset}
        variant="outline"
        text={`Download ${datasetName}`}
      />
      {downloadJob && <JobProgress job={downloadJob} clearJob={() => setDownloadJob(null)} />}
    </div>
  );
}

export default HFDownload;
Loading

0 comments on commit 75f26c8

Please sign in to comment.