Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 75 additions & 0 deletions utils/import_python_official.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,66 @@
logger = get_tqdm_logger(__name__)


def fill_links_from_history(df_ics: pd.DataFrame, df_yml: pd.DataFrame) -> pd.DataFrame:
    """Fill missing links in ICS data from historical conference data.

    For conferences without links, look up the conference name in historical data
    and use that link, replacing any year references with the current year.

    Parameters
    ----------
    df_ics : pd.DataFrame
        DataFrame with ICS conference data (may have empty or NaN links)
    df_yml : pd.DataFrame
        DataFrame with existing conference data from YAML files

    Returns
    -------
    pd.DataFrame
        DataFrame with missing links filled where historical data exists.
        Note: ``df_ics`` is modified in place and also returned.
    """
    if df_yml.empty or df_ics.empty:
        return df_ics

    # Build a lookup of conference name -> (link, year), keeping only the
    # most recent entry that actually has a non-empty link.
    historical_links: dict = {}
    for _, row in df_yml.iterrows():
        conf_name = row.get("conference", "")
        link = row.get("link", "")
        year = row.get("year", 0)

        if conf_name and link and (conf_name not in historical_links or year > historical_links[conf_name][1]):
            historical_links[conf_name] = (link, year)

    filled_count = 0
    for idx, row in df_ics.iterrows():
        link = row.get("link", "")
        # Treat pandas NaN the same as an empty string: str(nan) == "nan"
        # (length 3), so a plain length check would wrongly skip NaN links.
        if pd.isna(link) or not str(link).strip():
            conf_name = row.get("conference", "")
            target_year = row.get("year", datetime.now(tz=timezone.utc).year)

            if conf_name in historical_links:
                hist_link, hist_year = historical_links[conf_name]
                new_link = str(hist_link)
                # Replace the historical year with the target year in the link.
                # Escape the year before building the regex, and skip the
                # substitution entirely when the historical year is missing or
                # zero, so a stray "0" token in the URL is never clobbered.
                if hist_year and not pd.isna(hist_year):
                    new_link = re.sub(
                        rf"\b{re.escape(str(hist_year))}\b",
                        str(target_year),
                        new_link,
                    )
                df_ics.at[idx, "link"] = new_link
                filled_count += 1
                logger.debug(
                    f"Filled link for '{conf_name}' from historical data: {new_link}",
                )

    if filled_count > 0:
        logger.info(f"Filled {filled_count} missing links from historical conference data")

    return df_ics


def ics_to_dataframe() -> pd.DataFrame:
"""Parse an .ics file and return a DataFrame with the event data.

Expand Down Expand Up @@ -241,6 +301,21 @@ def main(year=None, base="") -> bool:
logger.warning("No conference data retrieved from calendar")
return False

# Try to fill missing links from historical conference data
logger.info("Filling missing links from historical data")
df_ics = fill_links_from_history(df_ics, df_yml)

# Filter out entries with empty or missing links
initial_count = len(df_ics)
df_ics = df_ics[df_ics["link"].str.len() > 0]
filtered_count = initial_count - len(df_ics)
if filtered_count > 0:
logger.info(f"Filtered out {filtered_count} entries without valid links")

if df_ics.empty:
logger.warning("No conferences with valid links after filtering")
return False

except Exception as e:
logger.error(f"Failed to initialize import process: {e}")
return False
Expand Down
20 changes: 12 additions & 8 deletions utils/sort_yaml.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,15 +301,19 @@ def sort_data(base="", prefix="", skip_links=False):
for i, q in enumerate(data.copy()):
data[i] = order_keywords(q)

def validate_conference(q: dict) -> Conference | None:
    """Build a ``Conference`` from *q*, or return ``None`` when validation fails."""
    try:
        conference = Conference(**q)
    except pydantic.ValidationError as e:
        # Log the schema violation and dump the offending entry for debugging.
        logger.error(f"❌ Validation error in conference: {e}")
        logger.debug(f"Invalid data: \n{yaml.dump(q, default_flow_style=False)}")
        return None
    return conference

logger.info("✅ Validating conference data with Pydantic schema")
validation_errors = 0

try:
new_data = [Conference(**q) for q in data]
except pydantic.ValidationError as e:
validation_errors += 1
logger.error(f"❌ Validation error in conference: {e}")
logger.debug(f"Invalid data: \n{yaml.dump(q, default_flow_style=False)}")
validated = [validate_conference(q) for q in data]
new_data = [c for c in validated if c is not None]
validation_errors = len(validated) - len(new_data)

if validation_errors > 0:
logger.warning(f"⚠️ {validation_errors} conferences failed validation and were skipped")
Expand Down