Source code for gridstatus.ieso

import datetime
import os
import re
import time
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Literal

import pandas as pd
import requests
import xmltodict

from gridstatus import utils
from gridstatus.base import ISOBase, NotSupported
from gridstatus.decorators import support_date_range
from gridstatus.gs_logging import logger

[docs]CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
[docs]CERTIFICATES_CHAIN_FILE = os.path.join( CURRENT_DIR, "public_certificates/ieso/intermediate_and_root.pem", )
"""LOAD CONSTANTS""" # Load hourly files go back 30 days
[docs]MAXIMUM_DAYS_IN_PAST_FOR_LOAD = 30
[docs]LOAD_INDEX_URL = "https://reports-public.ieso.ca/public/RealtimeConstTotals"
# Each load file covers one hour. We have to use the xml instead of the csv because # the csv does not have demand for Ontario.
[docs]LOAD_TEMPLATE_URL = f"{LOAD_INDEX_URL}/PUB_RealtimeConstTotals_YYYYMMDDHH.xml"
"""LOAD FORECAST CONSTANTS""" # There's only one load forecast for Ontario. This data covers from 5 days ago # through tomorrow
[docs]LOAD_FORECAST_URL = ( "https://www.ieso.ca/-/media/Files/IESO/Power-Data/Ontario-Demand-multiday.ashx" )
"""ZONAL LOAD FORECAST CONSTANTS"""
[docs]ZONAL_LOAD_FORECAST_INDEX_URL = ( "https://reports-public.ieso.ca/public/OntarioZonalDemand" )
# Each forecast file contains data from the day in the filename going forward for # 34 days. The most recent file does not have a date in the filename.
[docs]ZONAL_LOAD_FORECAST_TEMPLATE_URL = ( f"{ZONAL_LOAD_FORECAST_INDEX_URL}/PUB_OntarioZonalDemand_YYYYMMDD.xml" )
# The farthest in the past that forecast files are available
[docs]MAXIMUM_DAYS_IN_PAST_FOR_ZONAL_LOAD_FORECAST = 90
# The farthest in the future that forecasts are available. Note that there are not # files for these future forecasts, they are in the current day's file.
[docs]MAXIMUM_DAYS_IN_FUTURE_FOR_ZONAL_LOAD_FORECAST = 34
"""REAL TIME FUEL MIX CONSTANTS"""
[docs]FUEL_MIX_INDEX_URL = "https://reports-public.ieso.ca/public/GenOutputCapability/"
# Updated every hour and each file has data for one day. # The most recent version does not have the date in the filename.
[docs]FUEL_MIX_TEMPLATE_URL = f"{FUEL_MIX_INDEX_URL}/PUB_GenOutputCapability_YYYYMMDD.xml"
# Number of past days for which the complete generator report is available. # Before this date, only total by fuel type is available.
[docs]MAXIMUM_DAYS_IN_PAST_FOR_COMPLETE_GENERATOR_REPORT = 90
"""HISTORICAL FUEL MIX CONSTANTS"""
[docs]HISTORICAL_FUEL_MIX_INDEX_URL = ( "https://reports-public.ieso.ca/public/GenOutputbyFuelHourly/" )
# Updated once a day and each file contains data for an entire year.
[docs]HISTORICAL_FUEL_MIX_TEMPLATE_URL = ( f"{HISTORICAL_FUEL_MIX_INDEX_URL}/PUB_GenOutputbyFuelHourly_YYYY.xml" )
[docs]MINUTES_INTERVAL = 5
[docs]HOUR_INTERVAL = 1
# Default namespace used in the XML files
[docs]NAMESPACES_FOR_XML = {"": "http://www.ieso.ca/schema"}
[docs]class IESO(ISOBase): """Independent Electricity System Operator (IESO)""" name = "Independent Electricity System Operator" iso_id = "ieso" # All data is provided in EST, and does not change with DST. This means there are # no repeated or missing hours in the raw data and we can safely use tz_localize # without setting ambiguous or nonexistent times. # https://www.ieso.ca/-/media/Files/IESO/Document-Library/engage/ca/ca-Introduction-to-the-Capacity-Auction.ashx default_timezone = "EST" status_homepage = "https://www.ieso.ca/en/Power-Data" @support_date_range(frequency="HOUR_START")
[docs] def get_load( self, date: str | datetime.date | datetime.datetime, end: datetime.date | datetime.datetime | None = None, verbose: bool = False, ): """ Get 5-minute load for the Market and Ontario for a given date or from date to end date. Args: date (datetime.date | datetime.datetime | str): The date to get the load for Can be a `datetime.date` or `datetime.datetime` object, or a string with the values "today" or "latest". If `end` is None, returns only data for this date. end (datetime.date | datetime.datetime, optional): End date. Defaults None If provided, returns data from `date` to `end` date. The `end` can be a `datetime.date` or `datetime.datetime` object. verbose (bool, optional): Print verbose output. Defaults to False. frequency (str, optional): Frequency of data. Defaults to "5min". Returns: pd.DataFrame: zonal load as a wide table with columns for each zone """ today = utils._handle_date("today", tz=self.default_timezone) if date != "latest": if date.date() > today.date(): raise NotSupported("Load data is not available for future dates.") if date.date() < today.date() - pd.Timedelta( days=MAXIMUM_DAYS_IN_PAST_FOR_LOAD, ): raise NotSupported( f"Load data is not available for dates more than " f"{MAXIMUM_DAYS_IN_PAST_FOR_LOAD} days in the past.", ) # Return an empty dataframe when the date exceeds the current timestamp # since there's no load available yet. if date > pd.Timestamp.now(tz=self.default_timezone): return pd.DataFrame() elif date == "latest": date = pd.Timestamp.now(tz=self.default_timezone) df = self._retrieve_5_minute_load(date, end, verbose) cols_to_keep = [ "Interval Start", "Interval End", "Market Total Load", "Ontario Load", ] df["Market Total Load"] = df["Market Total Load"].astype(float) df["Ontario Load"] = df["Ontario Load"].astype(float) return utils.move_cols_to_front(df, cols_to_keep)[cols_to_keep].reset_index( drop=True, )
def _retrieve_5_minute_load( self, date: datetime.datetime, end: datetime.datetime | None = None, verbose: bool = False, ): # We have to add 1 to the hour to get the file because the filename with # hour x contains data for hour x-1. For example, to get data for # 9:00 - 9:55, we need to request the file for hour 10. # The hour should be in the range 1-24 hour = date.hour + 1 url = LOAD_TEMPLATE_URL.replace( "YYYYMMDDHH", f"{(date).strftime('%Y%m%d')}{hour:02d}", ) r = self._request(url, verbose) root = ET.fromstring(r.text) # Extracting all triples of Interval, Market Total Load, and Ontario Load interval_loads_and_demands = self._find_loads_at_each_interval_from_xml(root) df = pd.DataFrame( interval_loads_and_demands, columns=["Interval", "Market Total Load", "Ontario Load"], ) delivery_date = root.find("DocBody/DeliveryDate", NAMESPACES_FOR_XML).text delivery_hour = int(root.find("DocBody/DeliveryHour", NAMESPACES_FOR_XML).text) df["Delivery Date"] = pd.Timestamp(delivery_date, tz=self.default_timezone) # The starting hour is 1, so we subtract 1 to get the hour in the range 0-23 df["Delivery Hour Start"] = delivery_hour - 1 # Multiply the interval minus 1 by 5 to get the minutes in the range 0-55 df["Interval Minute Start"] = MINUTES_INTERVAL * (df["Interval"] - 1) df["Interval Start"] = ( df["Delivery Date"] + pd.to_timedelta(df["Delivery Hour Start"], unit="h") + pd.to_timedelta(df["Interval Minute Start"], unit="m") ) df["Interval End"] = df["Interval Start"] + pd.Timedelta( minutes=MINUTES_INTERVAL, ) if end: return df[df["Interval End"] <= pd.Timestamp(end)] return df
[docs] def get_load_forecast(self, date: str, verbose: bool = False): """ Get forecasted load for Ontario. Supports only "latest" and "today" because there is only one load forecast. Args: date (str): Either "today" or "latest" verbose (bool, optional): Print verbose output. Defaults to False. Returns: pd.DataFrame: Ontario load forecast """ if date not in ["today", "latest"]: raise NotSupported( "Only 'today' and 'latest' are supported for load forecasts.", ) root = ET.fromstring(self._request(LOAD_FORECAST_URL, verbose).text) # Extract values from <DataSet Series="Projected"> projected_values = [] # Iterate through the XML to find the DataSet with Series="Projected" for dataset in root.iter("DataSet"): if dataset.attrib.get("Series") == "Projected": for data in dataset.iter("Data"): for value in data.iter("Value"): projected_values.append(value.text) created_at = pd.Timestamp( root.find(".//CreatedAt").text, tz=self.default_timezone, ) start_date = pd.Timestamp( root.find(".//StartDate").text, tz=self.default_timezone, ) # Create the range of interval starts based on the number of values at an # hourly frequency interval_starts = pd.date_range( start_date, periods=len(projected_values), freq="h", tz=self.default_timezone, ) # Create a DataFrame with the projected values df_projected = pd.DataFrame(projected_values, columns=["Ontario Load Forecast"]) df_projected["Ontario Load Forecast"] = df_projected[ "Ontario Load Forecast" ].astype(float) df_projected["Publish Time"] = created_at df_projected["Interval Start"] = interval_starts df_projected["Interval End"] = df_projected["Interval Start"] + pd.Timedelta( hours=HOUR_INTERVAL, ) return utils.move_cols_to_front( df_projected, [ "Interval Start", "Interval End", "Publish Time", "Ontario Load Forecast", ], )
@support_date_range(frequency="DAY_START")
[docs] def get_zonal_load_forecast( self, date: str | datetime.date | tuple[datetime.date, datetime.date], end: datetime.date | datetime.datetime | None = None, verbose: bool = False, ): """ Get forecasted load by forecast zone (Ontario, East, West) for a given date or from date to end date. This method supports future dates. Supports data 90 days into the past and up to 34 days into the future. Args: date (datetime.date | datetime.datetime | str): The date to get the load for Can be a `datetime.date` or `datetime.datetime` object, or a string with the values "today" or "latest". If `end` is None, returns only data for this date. end (datetime.date | datetime.datetime, optional): End date. Defaults None If provided, returns data from `date` to `end` date. The `end` can be a `datetime.date` or `datetime.datetime` object. verbose (bool, optional): Print verbose output. Defaults to False. Returns: pd.DataFrame: forecasted load as a wide table with columns for each zone """ today = utils._handle_date("today", tz=self.default_timezone) if date != "latest": date = utils._handle_date(date, tz=self.default_timezone) if date.date() < today.date() - pd.Timedelta( days=MAXIMUM_DAYS_IN_PAST_FOR_ZONAL_LOAD_FORECAST, ): # Forecasts are not support for past dates raise NotSupported( "Past dates are not support for load forecasts more than " f"{MAXIMUM_DAYS_IN_PAST_FOR_ZONAL_LOAD_FORECAST} days in the past.", ) if date.date() > today.date() + pd.Timedelta( days=MAXIMUM_DAYS_IN_FUTURE_FOR_ZONAL_LOAD_FORECAST, ): raise NotSupported( f"Dates more than {MAXIMUM_DAYS_IN_FUTURE_FOR_ZONAL_LOAD_FORECAST} " "days in the future are not supported for load forecasts.", ) # For future dates, the most recent forecast is used if date == "latest" or date.date() > today.date(): url = ZONAL_LOAD_FORECAST_TEMPLATE_URL.replace("_YYYYMMDD", "") else: url = ZONAL_LOAD_FORECAST_TEMPLATE_URL.replace( "YYYYMMDD", date.strftime("%Y%m%d"), ) r = self._request(url, verbose) # Initialize a list to store the parsed data data = [] # Parse the XML file root = ET.fromstring(r.content) published_time = root.find(".//CreatedAt", NAMESPACES_FOR_XML).text # Extracting data for each ZonalDemands within the Document for zonal_demands in root.findall(".//ZonalDemands", NAMESPACES_FOR_XML): delivery_date = zonal_demands.find( ".//DeliveryDate", NAMESPACES_FOR_XML, ).text for zonal_demand in zonal_demands.findall( ".//ZonalDemand/*", NAMESPACES_FOR_XML, ): # The zone name is the tag name without the namespace zone_name = zonal_demand.tag[(zonal_demand.tag.rfind("}") + 1) :] for demand in zonal_demand.findall(".//Demand", NAMESPACES_FOR_XML): hour = demand.find(".//DeliveryHour", NAMESPACES_FOR_XML).text energy_mw = demand.find(".//EnergyMW", NAMESPACES_FOR_XML).text data.append( { "DeliveryDate": delivery_date, "Zone": zone_name, "DeliveryHour": hour, "EnergyMW": energy_mw, }, ) df = pd.DataFrame(data) # Convert columns to appropriate data types df["DeliveryHour"] = df["DeliveryHour"].astype(int) df["EnergyMW"] = df["EnergyMW"].astype(float) df["DeliveryDate"] = pd.to_datetime(df["DeliveryDate"]) df["Interval Start"] = ( # Need to subtract 1 from the DeliveryHour since that represents the # ending hour of the interval. (1 represents 00:00 - 01:00) df["DeliveryDate"] + pd.to_timedelta(df["DeliveryHour"] - 1, unit="h") ).dt.tz_localize(self.default_timezone) df["Interval End"] = df["Interval Start"] + pd.Timedelta(hours=HOUR_INTERVAL) # Pivot the table to wide pivot_df = df.pivot_table( index=["Interval Start", "Interval End"], columns="Zone", values="EnergyMW", aggfunc="first", ).reset_index() pivot_df["Publish Time"] = pd.Timestamp( published_time, tz=self.default_timezone, ) pivot_df = utils.move_cols_to_front( pivot_df, [ "Interval Start", "Interval End", "Publish Time", "Ontario", ], ) pivot_df.columns.name = None col_mapper = { col: f"{col} Load Forecast" for col in ["Ontario", "East", "West"] } pivot_df = pivot_df.rename(columns=col_mapper) # Return all the values from the latest forecast if date == "latest": return pivot_df # If no end is provided, return data from single date if not end: return pivot_df[pivot_df["Publish Time"].dt.date == date.date()] # Return data from date to end date end_date = utils._handle_date(end, tz=self.default_timezone) return pivot_df[ (pivot_df["Publish Time"] >= date) & (pivot_df["Publish Time"] <= end_date) ]
[docs] def get_fuel_mix( self, date: str | datetime.date | datetime.datetime, end: datetime.date | datetime.datetime | None = None, verbose: bool = False, ): """ Hourly output and capability for each fuel type (summed over all generators) for a given date or from date to end. Variable generators (solar and wind) have a forecast. Args: date (datetime.date | datetime.datetime | str): The date to get the load for Can be a `datetime.date` or `datetime.datetime` object, or a string with the values "today" or "latest". If `end` is None, returns only data for this date. end (datetime.date | datetime.datetime, optional): End date. Defaults None If provided, returns data from `date` to `end` date. The `end` can be a `datetime.date` or `datetime.datetime` object. verbose (bool, optional): Print verbose output. Defaults to False. Returns: pd.DataFrame: fuel mix """ # Required because this method is not decorated with support_date_range if isinstance(date, tuple): date, end = date # Boolean for whether to use the historical fuel mix data use_historical = False if date != "latest": today = utils._handle_date("today", tz=self.default_timezone) date = utils._handle_date(date, tz=self.default_timezone) if date.date() < today.date() - pd.Timedelta( days=MAXIMUM_DAYS_IN_PAST_FOR_COMPLETE_GENERATOR_REPORT, ): use_historical = True elif date.date() > today.date(): raise NotSupported("Fuel mix data is not available for future dates.") if use_historical: data = self._retrieve_historical_fuel_mix(date, end, verbose) else: data = ( self._retrieve_fuel_mix(date, end, verbose) .groupby(["Fuel Type", "Interval Start", "Interval End"]) .sum(numeric_only=True) .reset_index() ) pivoted = data.pivot_table( index=["Interval Start", "Interval End"], columns="Fuel Type", values="Output MW", ).reset_index() pivoted.columns = [c.title() for c in pivoted.columns] pivoted.index.name = None data = pivoted.copy() data = utils.move_cols_to_front( data, [ "Interval Start", "Interval End", "Biofuel", "Gas", "Hydro", "Nuclear", "Solar", "Wind", ], ) if end: end = utils._handle_date(end, tz=self.default_timezone) return data[ (data["Interval Start"] >= date) & (data["Interval Start"] <= end) ].reset_index(drop=True) elif date == "latest": return data return data[data["Interval Start"] >= date].reset_index(drop=True)
[docs] def get_generator_report_hourly( self, date: str | datetime.date | datetime.datetime, end: datetime.date | datetime.datetime | None = None, verbose: bool = False, ): """ Hourly output for each generator for a given date or from date to end. Variable generators (solar and wind) have a forecast and available capacity. Non-variable generators have a capability. Args: date (datetime.date | datetime.datetime | str): The date to get the load for Can be a `datetime.date` or `datetime.datetime` object, or a string with the values "today" or "latest". If `end` is None, returns only data for this date. end (datetime.date | datetime.datetime, optional): End date. Defaults None If provided, returns data from `date` to `end` date. The `end` can be a `datetime.date` or `datetime.datetime` object. verbose (bool, optional): Print verbose output. Defaults to False. Returns: pd.DataFrame: generator output and capability/available capacity """ # Required because this method is not decorated with support_date_range if isinstance(date, tuple): date, end = date if date != "latest": today = utils._handle_date("today", tz=self.default_timezone) date = utils._handle_date(date, tz=self.default_timezone) if date.date() < today.date() - pd.Timedelta( days=MAXIMUM_DAYS_IN_PAST_FOR_COMPLETE_GENERATOR_REPORT, ): raise NotSupported( f"Generator output and capability data is not available for dates " f"more than {MAXIMUM_DAYS_IN_PAST_FOR_COMPLETE_GENERATOR_REPORT} " "days in the past.", ) elif date.date() > today.date(): raise NotSupported( "Generator output and capability data is not available for future " "dates.", ) data = self._retrieve_fuel_mix(date, end, verbose) data = utils.move_cols_to_front( data, [ "Interval Start", "Interval End", "Generator Name", "Fuel Type", "Output MW", "Capability MW", "Available Capacity MW", "Forecast MW", ], ).sort_values(["Interval Start", "Fuel Type", "Generator Name"]) if end: end = utils._handle_date(end, tz=self.default_timezone) return data[ (data["Interval Start"] >= date) & (data["Interval Start"] <= end) ].reset_index(drop=True) if date == "latest": return data.reset_index(drop=True) return data[data["Interval Start"] >= date].reset_index(drop=True)
@support_date_range(frequency="DAY_START") def _retrieve_fuel_mix( self, date: ( str | datetime.date | datetime.datetime | tuple[datetime.date, datetime.date] ), end: datetime.date | datetime.datetime | None = None, verbose: bool = False, ): """Retrieve fuel mix data for a given date or date range. date (str | date | datetime | tuple[date, date]): The date or date range to retrieve fuel mix data for. end (date | datetime | None, optional): The end date of the date range. Defaults to None. verbose (bool, optional): Whether to print verbose output. Defaults to False. Returns: pd.DataFrame: Fuel mix data """ url = FUEL_MIX_TEMPLATE_URL.replace( "_YYYYMMDD", date.strftime("_%Y%m%d") if date != "latest" else "", ) r = self._request(url, verbose) root = ET.fromstring(r.content) # Define the namespace map. This is different than all the other XML files ns = {"": "http://www.theIMO.com/schema"} date = root.find(".//Date", ns).text data = [] for gen in root.findall(".//Generator", ns): generator_name = gen.find("GeneratorName", ns).text fuel_type = gen.find("FuelType", ns).text for output in gen.findall("Outputs/Output", ns): hour = output.find("Hour", ns).text energy_mw = ( output.find("EnergyMW", ns).text if output.find( "EnergyMW", ns, ) is not None else None ) # For SOLAR/WIND, the forecast is stored under the capability and these # Fuel types have an available capacity. See the schema definition: # http://reports.ieso.ca/docrefs/schema/GenOutputCapability_r3.xsd # There is no capability for these generators. if fuel_type in ["SOLAR", "WIND"]: forecast_mw = ( gen.find(f".//Capabilities/Capability[Hour='{hour}']", ns) .find("EnergyMW", ns) .text ) available_capacity_mw = ( gen.find( f".//Capacities/AvailCapacity[Hour='{hour}']", ns, ) .find("EnergyMW", ns) .text ) capability_mw = None # For non-SOLAR/WIND, there is no forecast or available capacity. # Instead, there is a capability. else: forecast_mw = None capability_mw = ( gen.find( f".//Capabilities/Capability[Hour='{hour}']", ns, ) .find("EnergyMW", ns) .text ) available_capacity_mw = None data.append( [ date, hour, generator_name, fuel_type, energy_mw, capability_mw, available_capacity_mw, forecast_mw, ], ) columns = [ "Date", "Hour", "Generator Name", "Fuel Type", "Output MW", "Capability MW", "Available Capacity MW", "Forecast MW", ] # Creating the DataFrame with the correct date df = pd.DataFrame(data, columns=columns) df["Interval Start"] = ( pd.to_datetime(df["Date"]) + pd.to_timedelta( # Subtract 1 from the hour because hour 1 is from 00:00 - 01:00 df["Hour"].astype(int) - 1, unit="h", ) ).dt.tz_localize(self.default_timezone) df["Interval End"] = df["Interval Start"] + pd.Timedelta(hours=1) float_cols = [ "Output MW", "Capability MW", "Available Capacity MW", "Forecast MW", ] df[float_cols] = df[float_cols].astype(float) return df.drop(columns=["Date", "Hour"]) @support_date_range(frequency="YEAR_START") def _retrieve_historical_fuel_mix( self, date: ( str | datetime.date | datetime.datetime | tuple[datetime.date, datetime.date] ), end: datetime.date | datetime.datetime | None = None, verbose: bool = False, ): date = utils._handle_date(date, tz=self.default_timezone) url = HISTORICAL_FUEL_MIX_TEMPLATE_URL.replace( "YYYY", str(date.year), ) r = self._request(url, verbose) root = ET.fromstring(r.content) ns = NAMESPACES_FOR_XML data = [] # Iterate through each day for day_data in root.findall(".//DailyData", ns): date = ( day_data.find("Day", ns).text if day_data.find("Day", ns) is not None else None ) # Iterate through each hour of the day for hourly_data in day_data.findall("HourlyData", ns): hour = ( hourly_data.find("Hour", ns).text if hourly_data.find("Hour", ns) is not None else None ) # Initialize fuel type outputs fuel_outputs = { "NUCLEAR": 0, "GAS": 0, "HYDRO": 0, "WIND": 0, "SOLAR": 0, "BIOFUEL": 0, } # Extracting output for each fuel type for fuel_total in hourly_data.findall("FuelTotal", ns): fuel_type = ( fuel_total.find("Fuel", ns).text if fuel_total.find("Fuel", ns) is not None else None ) output = ( fuel_total.find(".//Output", ns).text if fuel_total.find(".//Output", ns) is not None else 0 ) if fuel_type in fuel_outputs: fuel_outputs[fuel_type] = float(output) # Adding the row to the data list row = [date, hour] + list(fuel_outputs.values()) data.append(row) columns = ["Date", "Hour"] + list(fuel_outputs.keys()) columns = [c.title() for c in columns] # Creating the DataFrame with the adjusted parsing logic df = pd.DataFrame(data, columns=columns) df["Interval Start"] = ( pd.to_datetime(df["Date"]) + pd.to_timedelta( # Subtract 1 from the hour because hour 1 is from 00:00 - 01:00 df["Hour"].astype(int) - 1, unit="h", ) ).dt.tz_localize(self.default_timezone) df["Interval End"] = df["Interval Start"] + pd.Timedelta(hours=1) return utils.move_cols_to_front( df, [ "Interval Start", "Interval End", "Nuclear", "Gas", "Hydro", "Wind", "Solar", "Biofuel", ], ).drop(columns=["Date", "Hour"]) # Function to extract data for a specific Market Quantity considering namespace def _extract_load_in_market_quantity( self, market_quantity_element: ET.Element, market_quantity_name: str, ): for mq in market_quantity_element.findall("MQ", NAMESPACES_FOR_XML): market_quantity = mq.find("MarketQuantity", NAMESPACES_FOR_XML).text if market_quantity_name in market_quantity: return mq.find("EnergyMW", NAMESPACES_FOR_XML).text return None # Function to find all triples of 'Interval', 'Market Total Load', and # 'Ontario Load' in the XML file def _find_loads_at_each_interval_from_xml(self, root_element: ET.Element): interval_load_demand_triples = [] for interval_energy in root_element.findall( "DocBody/Energies/IntervalEnergy", NAMESPACES_FOR_XML, ): interval = interval_energy.find("Interval", NAMESPACES_FOR_XML).text market_total_load = self._extract_load_in_market_quantity( interval_energy, "Total Energy", ) ontario_load = self._extract_load_in_market_quantity( interval_energy, "ONTARIO DEMAND", ) if market_total_load and ontario_load: interval_load_demand_triples.append( [int(interval), float(market_total_load), float(ontario_load)], ) return interval_load_demand_triples def _request(self, url: str, verbose: bool = False): logger.info(f"Fetching URL: {url}") max_retries = 3 retry_num = 0 sleep = 5 # This URL is missing a complete certificate chain. The browser knows how # to retrieve the intermediate certificates, but requests does not. Therefore, # we need to provide the certificate chain manually (intermediate and root). if "www.ieso.ca" in url: tls_verify = CERTIFICATES_CHAIN_FILE else: tls_verify = True while retry_num < max_retries: r = requests.get(url, verify=tls_verify) if r.ok: break retry_num += 1 logger.info(f"Request failed. Error: {r.reason}. Retrying {retry_num}...") time.sleep(sleep) sleep *= 2 if not r.ok: raise Exception( f"Failed to retrieve data from {url} in {max_retries} tries.", ) return r @support_date_range(frequency="DAY_START")
[docs] def get_resource_adequacy_report( self, date: str | datetime.date | datetime.datetime, end: datetime.date | datetime.datetime | None = None, vintage: Literal["all", "latest"] = "latest", last_modified: str | datetime.date | datetime.datetime | None = None, ) -> pd.DataFrame: """Retrieve and parse the Resource Adequacy Report for a given date. Args: date (str | datetime.date | datetime.datetime): The date for which to get the report end (datetime.date | datetime.datetime | None): The end date for the range of reports to get vintage (Literal["all", "latest"]): The version of the report to get last_modified (str | datetime.date | datetime.datetime | None): The last modified time after which to get report(s) Returns: pd.DataFrame: The Resource Adequacy Report df for the given date """ if last_modified: last_modified = utils._handle_date(last_modified, tz=self.default_timezone) if vintage == "latest": json_data, file_last_modified = self._get_latest_resource_adequacy_json( date, last_modified, ) df = self._parse_resource_adequacy_report(json_data) df["Last Modified"] = file_last_modified elif vintage == "all": json_data_with_times = self._get_all_resource_adequacy_jsons( date, last_modified, ) dfs = [] for json_data, file_last_modified in json_data_with_times: df = self._parse_resource_adequacy_report(json_data) df["Last Modified"] = file_last_modified dfs.append(df) df = pd.concat(dfs) df = utils.move_cols_to_front( df, [ "Interval Start", "Interval End", "Publish Time", "Last Modified", ], ) logger.debug(f"DataFrame Shape: {df.shape}") return df.sort_values(["Interval Start", "Publish Time", "Last Modified"])
# Note(Kladar): This might be fairly generalizable to other XML reports from IESO def _get_latest_resource_adequacy_json( self, date: str | datetime.date | datetime.datetime, last_modified: pd.Timestamp | None = None, ) -> tuple[dict, datetime.datetime]: """Retrieve the Resource Adequacy Report for a given date and convert to JSON. There are often many files for a given date, so this function will return the file with the highest version number. It does not retrieve arbitrary files of lower version numbers. Args: date (str | datetime.date | datetime.datetime): The date for which to get the report last_modified (str | datetime.date | datetime.datetime | None): The last modified time after which to get report(s) Returns: tuple[dict, datetime.datetime]: The Resource Adequacy Report JSON and its last modified time """ base_url = "https://reports-public.ieso.ca/public/Adequacy2" if isinstance(date, (datetime.datetime, datetime.date)): date_str = date.strftime("%Y%m%d") else: date_str = date.replace("-", "") file_prefix = f"PUB_Adequacy2_{date_str}" r = self._request(base_url) files = re.findall(f'href="({file_prefix}.*?.xml)"', r.text) last_modified_times = re.findall(r"(\d{2}-\w{3}-\d{4} \d{2}:\d{2})", r.text) files_and_times = zip(files, last_modified_times) if not files: raise FileNotFoundError( f"No resource adequacy files found for date {date_str}", ) if last_modified: if last_modified.tz is None: last_modified = utils._handle_date( last_modified, tz=self.default_timezone, ) filtered_files = [ (file, time) for file, time in files_and_times if pd.Timestamp(time, tz=self.default_timezone) >= last_modified ] logger.info( f"Found {len(filtered_files)} files after last modified time {last_modified}", ) else: filtered_files = list(files_and_times) if not filtered_files: raise FileNotFoundError( f"No files found for date {date_str} after last modified time {last_modified}", ) unversioned_file = next( ((f, t) for f, t in filtered_files if "_v" not in f), None, ) if unversioned_file: latest_file, file_time = unversioned_file else: latest_file, file_time = max( filtered_files, key=lambda x: int(x[0].split("_v")[-1].replace(".xml", "")), ) logger.info(f"Latest file: {latest_file}") url = f"{base_url}/{latest_file}" r = self._request(url) json_data = xmltodict.parse(r.text) last_modified_time = pd.Timestamp(file_time, tz=self.default_timezone) return json_data, last_modified_time def _fetch_and_parse_file(self, base_url: str, file: str) -> dict: url = f"{base_url}/{file}" r = self._request(url) return xmltodict.parse(r.text) def _get_all_resource_adequacy_jsons( self, date: str | datetime.date | datetime.datetime, last_modified: pd.Timestamp | None = None, ) -> list[tuple[dict, datetime.datetime]]: """Retrieve all Resource Adequacy Report JSONs for a given date. There are often many files for a given date, so this function will return all files, the data of which may be separated by publish time. Args: date (str | datetime.date | datetime.datetime): The date for which to get the report last_modified (str | datetime.date | datetime.datetime | None): The last modified time after which to get report(s) Returns: dict: The Resource Adequacy Report JSON for the given date """ base_url = "https://reports-public.ieso.ca/public/Adequacy2" if isinstance(date, (datetime.datetime, datetime.date)): date_str = date.strftime("%Y%m%d") else: date_str = date.replace("-", "") file_prefix = f"PUB_Adequacy2_{date_str}" r = self._request(base_url) pattern = '<a href="({}.*?.xml)">.*?</a>\\s+(\\d{{2}}-\\w{{3}}-\\d{{4}} \\d{{2}}:\\d{{2}})' file_rows = re.findall(pattern.format(file_prefix), r.text) if not file_rows: raise FileNotFoundError( f"No resource adequacy files found for date {date_str}", ) if last_modified: if last_modified.tz is None: last_modified = utils._handle_date( last_modified, tz=self.default_timezone, ) filtered_files = [ (file, time) for file, time in file_rows if pd.Timestamp(time, tz=self.default_timezone) >= last_modified ] logger.info( f"Found {len(filtered_files)} files after last modified time {last_modified}", ) else: filtered_files = file_rows if not filtered_files: raise FileNotFoundError( f"No files found for date {date_str} after last modified time {last_modified}", ) json_data_with_times = [] with ThreadPoolExecutor(max_workers=min(10, len(filtered_files))) as executor: future_to_file = { executor.submit(self._fetch_and_parse_file, base_url, file): ( file, time, ) for file, time in filtered_files } for future in as_completed(future_to_file): file, time = future_to_file[future] try: json_data = future.result() json_data_with_times.append( (json_data, pd.Timestamp(time, tz=self.default_timezone)), ) except Exception as e: logger.error(f"Error processing file {file}: {str(e)}") return json_data_with_times def _parse_resource_adequacy_report(self, json_data: dict) -> pd.DataFrame: """Parse the Resource Adequacy Report JSON into DataFrames.""" document_body = json_data["Document"]["DocBody"] report_data = [] data_map = self._get_resource_adequacy_data_structure_map() # TODO(Kladar): this is clunky and could definitely be generalized to reduce # linecount, but it works for now. I kind of move around the report JSON to where I want # to extract data and then extract it, and that movement could be abstracted away # NOTE(kladar): suggested libraries that does this sort of thing are `dpath` and `glom` https://github.com/mahmoud/glom def get_nested_data(data: dict, path: list[str]) -> dict: """Helper function to traverse nested data using a path.""" for key in path: data = data[key] return data for section_name, section_data in data_map.items(): if "hourly" in section_data: for metric_name, config in section_data["hourly"].items(): self._extract_hourly_values( data=document_body, path=config["path"], column_name=metric_name, value_key=config["value_key"], report_data=report_data, ) if "fuel_type_hourly" in section_data: fuel_type_config = section_data["fuel_type_hourly"] resources = list( get_nested_data(document_body, fuel_type_config["path"]), ) for resource in resources: fuel_type = resource.get("FuelType") if fuel_type in fuel_type_config["resources"]: metrics = fuel_type_config["resources"][fuel_type] for metric, config in metrics.items(): self._extract_hourly_values( data=resource, path=config["path"], column_name=f"{fuel_type} {metric}", value_key=config["value_key"], report_data=report_data, ) for zonal_section in ["zonal_import_hourly", "zonal_export_hourly"]: if zonal_section in section_data: zonal_config = section_data[zonal_section] zones = get_nested_data(document_body, zonal_config["path"]) if not isinstance(zones, list): zones = [zones] for zone in zones: zone_name = zone.get("ZoneName") if zone_name in zonal_config["zones"]: metrics = zonal_config["zones"][zone_name] for metric, config in metrics.items(): self._extract_hourly_values( data=zone, path=config["path"], column_name=f"{zone_name} {metric}", value_key=config["value_key"], report_data=report_data, ) if "total_internal_resources" in section_data: total_internal_resources_config = section_data[ "total_internal_resources" ] total_resources = get_nested_data( document_body, total_internal_resources_config["path"], ) for section_name, config in total_internal_resources_config[ "sections" ].items(): self._extract_hourly_values( data=total_resources, path=config["path"], column_name=section_name, value_key=config["value_key"], report_data=report_data, ) if "total_imports" in section_data: total_imports_config = section_data["total_imports"] total_imports = get_nested_data( document_body, total_imports_config["path"], ) for metric, config in total_imports_config["metrics"].items(): self._extract_hourly_values( data=total_imports, path=config["path"], column_name=f"Total Imports {metric}", value_key=config["value_key"], report_data=report_data, ) if "total_exports" in section_data: total_exports_config = section_data["total_exports"] total_exports = get_nested_data( document_body, total_exports_config["path"], ) for metric, config in total_exports_config["metrics"].items(): self._extract_hourly_values( data=total_exports, path=config["path"], column_name=f"Total Exports {metric}", value_key=config["value_key"], report_data=report_data, ) if "reserves" in section_data: reserves_config = section_data["reserves"] reserves = get_nested_data(document_body, reserves_config["path"]) for section_name, config in reserves_config["sections"].items(): self._extract_hourly_values( data=reserves, path=config["path"], column_name=section_name, value_key=config["value_key"], report_data=report_data, ) if "ontario_demand" in section_data: ontario_demand_config = section_data["ontario_demand"] ontario_demand = get_nested_data( document_body, ontario_demand_config["path"], ) for section_name, config in ontario_demand_config["sections"].items(): if "sections" in config: continue self._extract_hourly_values( data=ontario_demand, path=config["path"], column_name=section_name, value_key=config["value_key"], report_data=report_data, ) for ontario_demand_btm in [ "Dispatchable Load", "Hourly Demand Response", ]: btm_config = ontario_demand_config["sections"][ontario_demand_btm] btm_data = ontario_demand[ontario_demand_btm.replace(" ", "")] for section_name, config in btm_config["sections"].items(): self._extract_hourly_values( data=btm_data, path=config["path"], column_name=section_name, value_key=config["value_key"], report_data=report_data, ) # NOTE(kladar): This is the first place where pandas is truly invoked, leaving it open for more modern # dataframe libraries to be swapped in in the future df = pd.DataFrame(report_data) publish_time = pd.Timestamp( json_data["Document"]["DocHeader"]["CreatedAt"], tz=self.default_timezone, ) delivery_date = pd.Timestamp( document_body["DeliveryDate"], tz=self.default_timezone, ) logger.debug(f"Publish Time: {publish_time}") logger.debug(f"Delivery Date: {delivery_date}") df["Interval Start"] = delivery_date + pd.to_timedelta( df["DeliveryHour"] - 1, unit="h", ) df["Interval End"] = df["Interval Start"] + pd.Timedelta(hours=1) df["Publish Time"] = publish_time df = df.drop(columns=["DeliveryHour"]) return df # TODO(Kladar): this could likely be developed from the XML structure, but this works for now # and is easier to modify and quite legible def _get_resource_adequacy_data_structure_map(self) -> dict: """Define mapping of hourly data locations and extraction rules""" return { "supply": { "hourly": { "Forecast Supply Capacity": { "path": ["ForecastSupply", "Capacities", "Capacity"], "value_key": "EnergyMW", }, "Forecast Supply Energy MWh": { "path": ["ForecastSupply", "Energies", "Energy"], "value_key": "EnergyMWhr", }, "Forecast Supply Bottled Capacity": { "path": ["ForecastSupply", "BottledCapacities", "Capacity"], "value_key": "EnergyMW", }, "Forecast Supply Regulation": { "path": ["ForecastSupply", "Regulations", "Regulation"], "value_key": "EnergyMW", }, "Total Forecast Supply": { "path": ["ForecastSupply", "TotalSupplies", "Supply"], "value_key": "EnergyMW", }, "Total Requirement": { "path": ["ForecastDemand", "TotalRequirements", "Requirement"], "value_key": "EnergyMW", }, "Capacity Excess Shortfall": { "path": ["ForecastDemand", "ExcessCapacities", "Capacity"], "value_key": "EnergyMW", }, "Energy Excess Shortfall MWh": { "path": ["ForecastDemand", "ExcessEnergies", "Energy"], "value_key": "EnergyMWhr", }, "Offered Capacity Excess Shortfall": { "path": [ "ForecastDemand", "ExcessOfferedCapacities", "Capacity", ], "value_key": "EnergyMW", }, "Resources Not Scheduled": { "path": [ "ForecastDemand", "UnscheduledResources", "UnscheduledResource", ], "value_key": "EnergyMW", }, "Imports Not Scheduled": { "path": [ "ForecastDemand", "UnscheduledImports", "UnscheduledImport", ], "value_key": "EnergyMW", }, }, "fuel_type_hourly": { "path": ["ForecastSupply", "InternalResources", "InternalResource"], "resources": { "Nuclear": { "Capacity": { "path": ["Capacities", "Capacity"], "value_key": "EnergyMW", }, "Outages": { "path": ["Outages", "Outage"], "value_key": "EnergyMW", }, "Offered": { "path": ["Offers", "Offer"], "value_key": "EnergyMW", }, "Scheduled": { "path": ["Schedules", "Schedule"], "value_key": "EnergyMW", }, }, "Gas": { "Capacity": { "path": ["Capacities", "Capacity"], "value_key": "EnergyMW", }, "Outages": { "path": ["Outages", "Outage"], "value_key": "EnergyMW", }, "Offered": { "path": ["Offers", "Offer"], "value_key": "EnergyMW", }, "Scheduled": { "path": ["Schedules", "Schedule"], "value_key": "EnergyMW", }, }, "Hydro": { "Capacity": { "path": ["Capacities", "Capacity"], "value_key": "EnergyMW", }, "Outages": { "path": ["Outages", "Outage"], "value_key": "EnergyMW", }, "Forecasted MWh": { "path": ["ForecastEnergies", "ForecastEnergy"], "value_key": "EnergyMWhr", }, "Offered": { "path": ["Offers", "Offer"], "value_key": "EnergyMW", }, "Scheduled": { "path": ["Schedules", "Schedule"], "value_key": "EnergyMW", }, }, "Wind": { "Capacity": { "path": ["Capacities", "Capacity"], "value_key": "EnergyMW", }, "Outages": { "path": ["Outages", "Outage"], "value_key": "EnergyMW", }, "Forecasted": { "path": ["Forecasts", "Forecast"], "value_key": "EnergyMW", }, "Scheduled": { "path": ["Schedules", "Schedule"], "value_key": "EnergyMW", }, }, "Solar": { "Capacity": { "path": ["Capacities", "Capacity"], "value_key": "EnergyMW", }, "Outages": { "path": ["Outages", "Outage"], "value_key": "EnergyMW", }, "Forecasted": { "path": ["Forecasts", "Forecast"], "value_key": "EnergyMW", }, "Scheduled": { "path": ["Schedules", "Schedule"], "value_key": "EnergyMW", }, }, "Biofuel": { "Capacity": { "path": ["Capacities", "Capacity"], "value_key": "EnergyMW", }, "Outages": { "path": ["Outages", "Outage"], "value_key": "EnergyMW", }, "Offered": { "path": ["Offers", "Offer"], "value_key": "EnergyMW", }, "Scheduled": { "path": ["Schedules", "Schedule"], "value_key": "EnergyMW", }, }, "Other": { "Capacity": { "path": ["Capacities", "Capacity"], "value_key": "EnergyMW", }, "Outages": { "path": ["Outages", "Outage"], "value_key": "EnergyMW", }, "Offered Forecasted": { "path": ["OfferForecasts", "OfferForecast"], "value_key": "EnergyMW", }, "Scheduled": { "path": ["Schedules", "Schedule"], "value_key": "EnergyMW", }, }, }, }, "total_internal_resources": { "path": [ "ForecastSupply", "InternalResources", "TotalInternalResources", ], "sections": { "Total Internal Resources Outages": { "path": ["Outages", "Outage"], "value_key": "EnergyMW", }, "Total Internal Resources Offered Forecasted": { "path": ["OfferForecasts", "OfferForecast"], "value_key": "EnergyMW", }, "Total Internal Resources Scheduled": { "path": ["Schedules", "Schedule"], "value_key": "EnergyMW", }, }, }, "zonal_import_hourly": { "path": ["ForecastSupply", "ZonalImports", "ZonalImport"], "zones": { "Manitoba": { "Imports Offered": { "path": ["Offers", "Offer"], "value_key": "EnergyMW", }, "Imports Scheduled": { "path": ["Schedules", "Schedule"], "value_key": "EnergyMW", }, }, "Minnesota": { "Imports Offered": { "path": ["Offers", "Offer"], "value_key": "EnergyMW", }, "Imports Scheduled": { "path": ["Schedules", "Schedule"], "value_key": "EnergyMW", }, }, "Michigan": { "Imports Offered": { "path": ["Offers", "Offer"], "value_key": "EnergyMW", }, "Imports Scheduled": { "path": ["Schedules", "Schedule"], "value_key": "EnergyMW", }, }, "New York": { "Imports Offered": { "path": ["Offers", "Offer"], "value_key": "EnergyMW", }, "Imports Scheduled": { "path": ["Schedules", "Schedule"], "value_key": "EnergyMW", }, }, "Quebec": { "Imports Offered": { "path": ["Offers", "Offer"], "value_key": "EnergyMW", }, "Imports Scheduled": { "path": ["Schedules", "Schedule"], "value_key": "EnergyMW", }, }, }, }, "total_imports": { "path": ["ForecastSupply", "ZonalImports", "TotalImports"], "metrics": { "Offers": { "path": ["Offers", "Offer"], "value_key": "EnergyMW", }, "Scheduled": { "path": ["Schedules", "Schedule"], "value_key": "EnergyMW", }, "Estimated": { "path": ["Estimates", "Estimate"], "value_key": "EnergyMW", }, "Capacity": { "path": ["Capacities", "Capacity"], "value_key": "EnergyMW", }, }, }, }, "demand": { "ontario_demand": { "path": ["ForecastDemand", "OntarioDemand"], "sections": { "Ontario Demand Forecast": { "path": ["ForecastOntDemand", "Demand"], "value_key": "EnergyMW", }, "Ontario Peak Demand": { "path": ["PeakDemand", "Demand"], "value_key": "EnergyMW", }, "Ontario Average Demand": { "path": ["AverageDemand", "Demand"], "value_key": "EnergyMW", }, "Ontario Wind Embedded Forecast": { "path": ["WindEmbedded", "Embedded"], "value_key": "EnergyMW", }, "Ontario Solar Embedded Forecast": { "path": ["SolarEmbedded", "Embedded"], "value_key": "EnergyMW", }, "Dispatchable Load": { "sections": { "Ontario Dispatchable Load Capacity": { "path": ["Capacities", "Capacity"], "value_key": "EnergyMW", }, "Ontario Dispatchable Load Bid Forecasted": { "path": ["BidForecasts", "BidForecast"], "value_key": "EnergyMW", }, "Ontario Dispatchable Load Scheduled ON": { "path": ["ScheduledON", "Schedule"], "value_key": "EnergyMW", }, "Ontario Dispatchable Load Scheduled OFF": { "path": ["ScheduledOFF", "Schedule"], "value_key": "EnergyMW", }, }, }, "Hourly Demand Response": { "sections": { "Ontario Hourly Demand Response Bid Forecasted": { "path": ["Bids", "Bid"], "value_key": "EnergyMW", }, "Ontario Hourly Demand Response Scheduled": { "path": ["Schedules", "Schedule"], "value_key": "EnergyMW", }, "Ontario Hourly Demand Response Curtailed": { "path": ["Curtailed", "Curtail"], "value_key": "EnergyMW", }, }, }, }, }, "zonal_export_hourly": { "path": ["ForecastDemand", "ZonalExports", "ZonalExport"], "zones": { "Manitoba": { "Exports Offered": { "path": ["Bids", "Bid"], "value_key": "EnergyMW", }, "Exports Scheduled": { "path": ["Schedules", "Schedule"], "value_key": "EnergyMW", }, }, "Minnesota": { "Exports Offered": { "path": ["Bids", "Bid"], "value_key": "EnergyMW", }, "Exports Scheduled": { "path": ["Schedules", "Schedule"], "value_key": "EnergyMW", }, }, "Michigan": { "Exports Offered": { "path": ["Bids", "Bid"], "value_key": "EnergyMW", }, "Exports Scheduled": { "path": ["Schedules", "Schedule"], "value_key": "EnergyMW", }, }, "New York": { "Exports Offered": { "path": ["Bids", "Bid"], "value_key": "EnergyMW", }, "Exports Scheduled": { "path": ["Schedules", "Schedule"], "value_key": "EnergyMW", }, }, "Quebec": { "Exports Offered": { "path": ["Bids", "Bid"], "value_key": "EnergyMW", }, "Exports Scheduled": { "path": ["Schedules", "Schedule"], "value_key": "EnergyMW", }, }, }, }, "total_exports": { "path": ["ForecastDemand", "ZonalExports", "TotalExports"], "metrics": { "Bids": { "path": ["Bids", "Bid"], "value_key": "EnergyMW", }, "Scheduled": { "path": ["Schedules", "Schedule"], "value_key": "EnergyMW", }, "Capacity": { "path": ["Capacities", "Capacity"], "value_key": "EnergyMW", }, }, }, "reserves": { "path": ["ForecastDemand", "GenerationReserveHoldback"], "sections": { "Total Operating Reserve": { "path": ["TotalORReserve", "ORReserve"], "value_key": "EnergyMW", }, "Minimum 10 Minute Operating Reserve": { "path": ["Min10MinOR", "Min10OR"], "value_key": "EnergyMW", }, "Minimum 10 Minute Spin OR": { "path": ["Min10MinSpinOR", "Min10SpinOR"], "value_key": "EnergyMW", }, "Load Forecast Uncertainties": { "path": ["LoadForecastUncertainties", "Uncertainty"], "value_key": "EnergyMW", }, "Additional Contingency Allowances": { "path": ["ContingencyAllowances", "Allowance"], "value_key": "EnergyMW", }, }, }, }, } def _extract_hourly_values( self, data: dict, path: list[str], column_name: str, value_key: str, report_data: list[dict], ) -> None: """Extract hourly values from nested json data into report_data list, which becomes a dataframe later. Args: data: Source data dictionary path: List of keys to traverse to reach hourly data (e.g. ["Capacities", "Capacity"]) column_name: Name for the extracted data column value_key: Key containing the value to extract (e.g. "EnergyMW") report_data: List to store extracted hourly data rows """ current = data for key in path[:-1]: if key not in current: logger.debug( f"Path segment {path} has no key '{key}' in the data structure. Investigate the report data map definition.", ) return current = current[key] items = current.get(path[-1], []) if items is None: items = [] elif not isinstance(items, list): items = [items] existing_hours = {row["DeliveryHour"] for row in report_data} for hour in range(1, 25): if hour not in existing_hours: report_data.append({"DeliveryHour": hour}) hours_with_values = set() for item in items: if item is None: continue hour = int(item["DeliveryHour"]) hours_with_values.add(hour) row = next(r for r in report_data if r["DeliveryHour"] == hour) try: value = item.get(value_key) row[column_name] = float(value) if value is not None else None except (ValueError, TypeError): row[column_name] = None missing_value_hours = set(range(1, 25)) - hours_with_values if missing_value_hours: logger.debug( f"Detected {len(missing_value_hours)} hours without values for column {column_name}. Filling in with None.", ) for hour in range(1, 25): if hour not in hours_with_values: row = next(r for r in report_data if r["DeliveryHour"] == hour) row[column_name] = None