Skip to content

Commit

Permalink
Add direct_support keys in `core_eia860__scd_generators_energy_storag…
Browse files Browse the repository at this point in the history
…e` as foreign keys to `core_eia__entity_generators` (catalyst-cooperative#3699)

* Add direct_support keys to FK

* Try to harvest gen and plant IDs from direct storage cols

* Add migrations

* Fix mapped schemas

* Make 3 migrations into 1, remove boiler mapped_schema

* Update EIA row numbers

* Update dependencies and fix new Ruff linting errors.

* Add unmapped EIA plant IDs to mapping spreadsheet

* Merge in main

* Drop troublesome rows

* Update row counts

* Fix alembic migrations

* Drop empty generator, reorder _out_eia__yearly_generators merge to drop rows with existing plant data, no generator data

---------

Co-authored-by: Zane Selvans <[email protected]>
  • Loading branch information
e-belfer and zaneselvans authored Jul 25, 2024
1 parent 30d23ac commit e47be85
Show file tree
Hide file tree
Showing 9 changed files with 141 additions and 12 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
"""Update direct_support plant IDs to integers, create FK relationship between direct support IDs and harvested entities
Revision ID: 49d2f4f7d7b7
Revises: aee9c15c7394
Create Date: 2024-06-26 13:38:03.884714
"""
from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
# ``revision`` is this migration's own ID; ``down_revision`` is the ID of the
# parent migration that this one is applied on top of.
revision = '49d2f4f7d7b7'
down_revision = 'aee9c15c7394'
# This migration declares no branch labels and no cross-branch dependencies.
branch_labels = None
depends_on = None


def upgrade() -> None:
    """Make direct-support plant IDs integers and link them to generator entities.

    Within a single batch operation on ``core_eia860__scd_generators_energy_storage``:

    * change ``plant_id_eia_direct_support_{1,2,3}`` from FLOAT to Integer, and
    * create one composite foreign key per (``plant_id_eia_direct_support_N``,
      ``generator_id_direct_support_N``) pair, each referencing
      (``plant_id_eia``, ``generator_id``) in ``core_eia__entity_generators``.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    with op.batch_alter_table('core_eia860__scd_generators_energy_storage', schema=None) as batch_op:
        # Retype the three direct-support plant ID columns; they stay nullable.
        batch_op.alter_column('plant_id_eia_direct_support_1',
               existing_type=sa.FLOAT(),
               type_=sa.Integer(),
               existing_nullable=True)
        batch_op.alter_column('plant_id_eia_direct_support_2',
               existing_type=sa.FLOAT(),
               type_=sa.Integer(),
               existing_nullable=True)
        batch_op.alter_column('plant_id_eia_direct_support_3',
               existing_type=sa.FLOAT(),
               type_=sa.Integer(),
               existing_nullable=True)
        # Add the composite (plant ID, generator ID) foreign keys. The constraint
        # names are wrapped in batch_op.f() and must match those dropped in
        # downgrade().
        batch_op.create_foreign_key(batch_op.f('fk_core_eia860__scd_generators_energy_storage_plant_id_eia_direct_support_2_core_eia__entity_generators'), 'core_eia__entity_generators', ['plant_id_eia_direct_support_2', 'generator_id_direct_support_2'], ['plant_id_eia', 'generator_id'])
        batch_op.create_foreign_key(batch_op.f('fk_core_eia860__scd_generators_energy_storage_plant_id_eia_direct_support_1_core_eia__entity_generators'), 'core_eia__entity_generators', ['plant_id_eia_direct_support_1', 'generator_id_direct_support_1'], ['plant_id_eia', 'generator_id'])
        batch_op.create_foreign_key(batch_op.f('fk_core_eia860__scd_generators_energy_storage_plant_id_eia_direct_support_3_core_eia__entity_generators'), 'core_eia__entity_generators', ['plant_id_eia_direct_support_3', 'generator_id_direct_support_3'], ['plant_id_eia', 'generator_id'])

    # ### end Alembic commands ###


def downgrade() -> None:
    """Reverse ``upgrade()``: drop the direct-support FKs and restore FLOAT IDs.

    Within a single batch operation on ``core_eia860__scd_generators_energy_storage``:

    * drop the three foreign key constraints on the direct-support plant and
      generator ID columns, and
    * change ``plant_id_eia_direct_support_{1,2,3}`` from Integer back to FLOAT.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    with op.batch_alter_table('core_eia860__scd_generators_energy_storage', schema=None) as batch_op:
        # Drop the foreign keys first, using the same names created in upgrade().
        batch_op.drop_constraint(batch_op.f('fk_core_eia860__scd_generators_energy_storage_plant_id_eia_direct_support_3_core_eia__entity_generators'), type_='foreignkey')
        batch_op.drop_constraint(batch_op.f('fk_core_eia860__scd_generators_energy_storage_plant_id_eia_direct_support_1_core_eia__entity_generators'), type_='foreignkey')
        batch_op.drop_constraint(batch_op.f('fk_core_eia860__scd_generators_energy_storage_plant_id_eia_direct_support_2_core_eia__entity_generators'), type_='foreignkey')
        # Then restore the original FLOAT type on the plant ID columns; they
        # stay nullable.
        batch_op.alter_column('plant_id_eia_direct_support_3',
               existing_type=sa.Integer(),
               type_=sa.FLOAT(),
               existing_nullable=True)
        batch_op.alter_column('plant_id_eia_direct_support_2',
               existing_type=sa.Integer(),
               type_=sa.FLOAT(),
               existing_nullable=True)
        batch_op.alter_column('plant_id_eia_direct_support_1',
               existing_type=sa.Integer(),
               type_=sa.FLOAT(),
               existing_nullable=True)

    # ### end Alembic commands ###
27 changes: 27 additions & 0 deletions src/pudl/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1498,6 +1498,33 @@ def drop_records_with_null_in_column(
return df.dropna(subset=[column])


def drop_all_null_records_with_multiindex(
df: pd.DataFrame, idx_cols: list[str], idx_records: list[tuple[str | int | bool]]
) -> pd.DataFrame:
"""Given a set of multi-index values, drop expected all null rows.
Take a dataframe, and check that a row with given values in idx_cols (e.g.,
plant_id_eia, generator_id) is null in all other rows. If so, drop these rows from
the dataframe. If not, raise an assertion error to prevent accidentally dropping
data.
Args:
df: table with data to drop.
idx_cols: list of multi-index columns to index against.
idx_records: corresponding index values for each row to be dropped.
Raises:
AssertionError: If there is data in the expected rows.
"""
# ensure there isn't more than the expected number of nulls before dropping
df = df.set_index(idx_cols)
assert df.loc[idx_records].isnull().all().all(), (
"Non-null data found where no data was expected:",
f"{df.loc[idx_records].dropna(axis='columns', how='all')}",
) # Make sure all values in all rows and columns here are null
return df.drop(idx_records).reset_index()


def standardize_percentages_ratio(
frac_df: pd.DataFrame,
mixed_cols: list[str],
Expand Down
6 changes: 3 additions & 3 deletions src/pudl/metadata/fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -4886,7 +4886,7 @@
),
},
"plant_id_eia_direct_support_1": {
"type": "number",
"type": "integer",
"description": (
"The EIA Plant ID of the primary unit whose generation this energy storage "
"device is intended to firm or store."
Expand All @@ -4900,7 +4900,7 @@
),
},
"plant_id_eia_direct_support_2": {
"type": "number",
"type": "integer",
"description": (
"The EIA Plant ID of the secondary unit whose generation this energy storage "
"device is intended to firm or store."
Expand All @@ -4914,7 +4914,7 @@
),
},
"plant_id_eia_direct_support_3": {
"type": "number",
"type": "integer",
"description": (
"The EIA Plant ID of the tertiary unit whose generation this energy storage "
"device is intended to firm or store."
Expand Down
15 changes: 15 additions & 0 deletions src/pudl/metadata/resources/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,9 @@
],
"mapped_schemas": [
{"operator_utility_id_eia": "utility_id_eia"},
{"plant_id_eia_direct_support_1": "plant_id_eia"},
{"plant_id_eia_direct_support_2": "plant_id_eia"},
{"plant_id_eia_direct_support_3": "plant_id_eia"},
],
},
"generators": {
Expand Down Expand Up @@ -167,6 +170,18 @@
],
"mapped_schemas": [
{"operator_utility_id_eia": "utility_id_eia"},
{
"plant_id_eia_direct_support_1": "plant_id_eia",
"generator_id_direct_support_1": "generator_id",
},
{
"plant_id_eia_direct_support_2": "plant_id_eia",
"generator_id_direct_support_2": "generator_id",
},
{
"plant_id_eia_direct_support_3": "plant_id_eia",
"generator_id_direct_support_3": "generator_id",
},
],
},
"boilers": {
Expand Down
7 changes: 6 additions & 1 deletion src/pudl/metadata/resources/eia.py
Original file line number Diff line number Diff line change
Expand Up @@ -601,7 +601,12 @@
],
"primary_key": ["plant_id_eia", "generator_id"],
"foreign_key_rules": {
"fields": [["plant_id_eia", "generator_id"]],
"fields": [
["plant_id_eia", "generator_id"],
["plant_id_eia_direct_support_1", "generator_id_direct_support_1"],
["plant_id_eia_direct_support_2", "generator_id_direct_support_2"],
["plant_id_eia_direct_support_3", "generator_id_direct_support_3"],
],
# exclude core_epa__assn_eia_epacamd_subplant_ids bc there are generator ids in this
# glue table that come only from epacamd
# also exclude the 860 changelog table bc that table doesn't get harvested
Expand Down
30 changes: 25 additions & 5 deletions src/pudl/output/eia.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from dagster import Field, asset

import pudl
from pudl.helpers import drop_all_null_records_with_multiindex
from pudl.transform.eia import occurrence_consistency
from pudl.transform.eia861 import add_backfilled_ba_code_column

Expand Down Expand Up @@ -136,18 +137,37 @@ def _out_eia__yearly_generators(
A DataFrame containing all the fields of the EIA 860 Utilities table.
"""
# Almost all the info we need will come from here.

out_df = pd.merge(
core_eia860__scd_generators,
core_eia__entity_plants,
core_eia__entity_generators,
how="left",
on=["plant_id_eia"],
on=["plant_id_eia", "generator_id"],
)

# If any generator data is completely empty, drop it.
# These are five known generators that originate from harvesting the plant and
# generator IDs found in the plant_id_eia_direct_support_x and
# generator_id_direct_support_x in EIA 860 energy storage tables, in
# order to enable foreign key relationships with these columns.
# They do not show up in any other tables and thus lack data in all columns.
# For more, see issue #3695 and PR #3699.
empty_generator_ids = [
(9170, "3093", "2023-01-01"),
(18170, "B8170", "2023-01-01"),
(34516, "SOL1", "2023-01-01"),
(64966, "GEN1", "2023-01-01"),
(60321, "PV1", "2023-01-01"),
]
out_df = drop_all_null_records_with_multiindex(
out_df, ["plant_id_eia", "generator_id", "report_date"], empty_generator_ids
)

# Add core entity data about EIA plants
out_df = pd.merge(
out_df,
core_eia__entity_generators,
core_eia__entity_plants,
how="left",
on=["plant_id_eia", "generator_id"],
on=["plant_id_eia"],
)

out_df.report_date = pd.to_datetime(out_df.report_date)
Expand Down
Binary file modified src/pudl/package_data/glue/pudl_id_mapping.xlsx
Binary file not shown.
4 changes: 3 additions & 1 deletion src/pudl/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,9 @@ def no_null_rows(df, cols="all", df_name="", thresh=0.9):

null_rows = df[cols].isna().sum(axis="columns") / len(cols) > thresh
if null_rows.any():
raise ValueError(f"Found {null_rows.sum(axis='rows')} Null rows in {df_name}.")
raise ValueError(
f"Found {null_rows.sum(axis='rows')} null rows in {df_name}./n {df[null_rows]}"
)

return df

Expand Down
4 changes: 2 additions & 2 deletions test/validate/eia_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,10 @@ def test_no_null_cols_eia(pudl_out_eia, live_dbs, cols, df_name):
("boil_eia860", 89_051, 89_051, 89_051),
("frc_eia923", 673_343, 274_479, 26_709),
("gen_eia923", None, 5_494_932, 459_711),
("gens_eia860", 590_839, 590_839, 590_839),
("gens_eia860", 590_882, 590_882, 590_882),
("gf_eia923", 3_064_042, 3_064_042, 260_842),
("own_eia860", 95_104, 95_104, 95_104),
("plants_eia860", 215_878, 215_878, 215_878),
("plants_eia860", 215_884, 215_884, 215_884),
("pu_eia860", 214_965, 214_965, 214_965),
("utils_eia860", 147_877, 147_877, 147_877),
("emissions_control_equipment_eia860", 62_102, 62_102, 62_102),
Expand Down

0 comments on commit e47be85

Please sign in to comment.