Skip to content

Commit

Permalink
small updates
Browse files Browse the repository at this point in the history
  • Loading branch information
gkucsko committed Apr 21, 2023
1 parent d53b43e commit 9751cfb
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 10 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ audio_array = generate_audio(text_prompt)

### 🎤 Voice Presets and Voice/Audio Cloning

Bark has the capability to fully clone voices - including tone, pitch, emotion and prosody. The model also attempts to preserve music, ambient noise, etc. from input audio. However, to mitigate misuse of this technology, we limit the audio history prompts to a limited set of Suno-provided, fully synthetic options to choose from for each language. Specify following the pattern: `{lang_code}_speaker_{number}`.
Bark has the capability to fully clone voices - including tone, pitch, emotion and prosody. The model also attempts to preserve music, ambient noise, etc. from input audio. However, to mitigate misuse of this technology, we restrict the audio history prompts to a set of Suno-provided, fully synthetic options to choose from for each language. Specify one using the pattern: `{lang_code}_speaker_{0-9}`.

```python
text_prompt = """
Expand Down
16 changes: 14 additions & 2 deletions bark/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,15 @@ def text_to_semantic(
text: str,
history_prompt: Optional[str] = None,
temp: float = 0.7,
silent: bool = False,
):
"""Generate semantic array from text.
Args:
text: text to be turned into audio
history_prompt: history choice for audio cloning
temp: generation temperature (1.0 more diverse, 0.0 more conservative)
silent: disable progress bar
Returns:
numpy semantic array to be fed into `semantic_to_waveform`
Expand All @@ -24,6 +26,7 @@ def text_to_semantic(
text,
history_prompt=history_prompt,
temp=temp,
silent=silent,
)
return x_semantic

Expand All @@ -32,13 +35,15 @@ def semantic_to_waveform(
semantic_tokens: np.ndarray,
history_prompt: Optional[str] = None,
temp: float = 0.7,
silent: bool = False,
):
"""Generate audio array from semantic input.
Args:
semantic_tokens: semantic token output from `text_to_semantic`
history_prompt: history choice for audio cloning
temp: generation temperature (1.0 more diverse, 0.0 more conservative)
silent: disable progress bar
Returns:
numpy audio array at sample frequency 24khz
Expand All @@ -47,6 +52,7 @@ def semantic_to_waveform(
semantic_tokens,
history_prompt=history_prompt,
temp=temp,
silent=silent,
)
x_fine_gen = generate_fine(
x_coarse_gen,
Expand All @@ -62,6 +68,7 @@ def generate_audio(
history_prompt: Optional[str] = None,
text_temp: float = 0.7,
waveform_temp: float = 0.7,
silent: bool = False,
):
"""Generate audio array from input text.
Expand All @@ -70,10 +77,15 @@ def generate_audio(
history_prompt: history choice for audio cloning
text_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
waveform_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
silent: disable progress bar
Returns:
numpy audio array at sample frequency 24khz
"""
x_semantic = text_to_semantic(text, history_prompt=history_prompt, temp=text_temp)
audio_arr = semantic_to_waveform(x_semantic, history_prompt=history_prompt, temp=waveform_temp)
x_semantic = text_to_semantic(
text, history_prompt=history_prompt, temp=text_temp, silent=silent,
)
audio_arr = semantic_to_waveform(
x_semantic, history_prompt=history_prompt, temp=waveform_temp, silent=silent,
)
return audio_arr
12 changes: 6 additions & 6 deletions bark/generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,9 +137,9 @@ def _parse_s3_filepath(s3_filepath):
def _download(from_s3_path, to_local_path):
os.makedirs(CACHE_DIR, exist_ok=True)
response = requests.get(from_s3_path, stream=True)
total_size_in_bytes = int(response.headers.get('content-length', 0))
block_size = 1024 # 1 Kibibyte
progress_bar = tqdm.tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
total_size_in_bytes = int(response.headers.get("content-length", 0))
block_size = 1024
progress_bar = tqdm.tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
with open(to_local_path, "wb") as file:
for data in response.iter_content(block_size):
progress_bar.update(len(data))
Expand Down Expand Up @@ -191,7 +191,7 @@ def clean_models(model_key=None):

def _load_model(ckpt_path, device, model_type="text"):
if "cuda" not in device:
logger.warning("No GPU being used. Careful, Inference might be extremely slow!")
logger.warning("No GPU being used. Careful, inference might be extremely slow!")
if model_type == "text":
ConfigClass = GPTConfig
ModelClass = GPT
Expand All @@ -207,10 +207,10 @@ def _load_model(ckpt_path, device, model_type="text"):
os.path.exists(ckpt_path) and
_md5(ckpt_path) != REMOTE_MODEL_PATHS[model_type]["checksum"]
):
logger.warning(f"found outdated {model_type} model, removing...")
logger.warning(f"found outdated {model_type} model, removing.")
os.remove(ckpt_path)
if not os.path.exists(ckpt_path):
logger.info(f"{model_type} model not found, downloading...")
logger.info(f"{model_type} model not found, downloading into `{CACHE_DIR}`.")
_download(REMOTE_MODEL_PATHS[model_type]["path"], ckpt_path)
checkpoint = torch.load(ckpt_path, map_location=device)
# this is a hack
Expand Down
2 changes: 1 addition & 1 deletion model-card.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ The following is additional information about the models released here.

Bark is a series of three transformer models that turn text into audio.
### Text to semantic tokens
- Input: text, tokenized with [BERT tokenizer from huggingface](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertTokenizer)
- Input: text, tokenized with [BERT tokenizer from Hugging Face](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertTokenizer)
- Output: semantic tokens that encode the audio to be generated

### Semantic to coarse tokens
Expand Down

0 comments on commit 9751cfb

Please sign in to comment.