diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f94db7a352..0f0fa0fb60 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -27,7 +27,7 @@ repos: - --force-single-line-imports - --profile black - repo: https://github.com/asottile/pyupgrade # Upgrade Python syntax - rev: v3.15.2 + rev: v3.21.2 hooks: - id: pyupgrade args: diff --git a/_includes/head.html b/_includes/head.html index 46e2058785..5fa233c0f8 100644 --- a/_includes/head.html +++ b/_includes/head.html @@ -89,6 +89,7 @@ twitter: {{ conf.twitter | jsonify }}, mastodon: {{ conf.mastodon | jsonify }}, bluesky: {{ conf.bluesky | jsonify }}, + youtube: {{ conf.youtube | jsonify }}, location: {{ conf.location | jsonify }}, extra_places: {{ conf.extra_places | jsonify }}, workshop_deadline: {{ conf.workshop_deadline | jsonify }}, diff --git a/_includes/index_conf_title_row.html b/_includes/index_conf_title_row.html index 765dd9b7da..0400e4b1d9 100644 --- a/_includes/index_conf_title_row.html +++ b/_includes/index_conf_title_row.html @@ -23,6 +23,9 @@ {% elsif conf.twitter %} Twitter {% endif %} + {% if conf.bluesky %} + + {% endif %} diff --git a/_layouts/conference.html b/_layouts/conference.html index 1865c52fd4..50959ba31d 100644 --- a/_layouts/conference.html +++ b/_layouts/conference.html @@ -162,6 +162,18 @@

a.k.a. {{page.alt_name}} {{page.year}}

Mastodon {% endif %} + {% if page.bluesky %} +
+ + Bluesky +
+ {% endif %} + {% if page.youtube %} +
+ + YouTube +
+ {% endif %}
diff --git a/_layouts/summary.html b/_layouts/summary.html index 80dac2620f..9f796ca556 100644 --- a/_layouts/summary.html +++ b/_layouts/summary.html @@ -71,6 +71,18 @@

Mastodon

{% endif %} + {% if confs[0].bluesky %} +
+ + Bluesky +
+ {% endif %} + {% if confs[0].youtube %} +
+ + YouTube +
+ {% endif %}
diff --git a/tests/test_youtube_extraction.py b/tests/test_youtube_extraction.py new file mode 100644 index 0000000000..87da528d1f --- /dev/null +++ b/tests/test_youtube_extraction.py @@ -0,0 +1,99 @@ +"""Tests for YouTube link extraction and Mastodon/YouTube disambiguation.""" + +import sys +from pathlib import Path +from unittest.mock import patch + +sys.path.append(str(Path(__file__).parent.parent / "utils")) + +from enrich_tba import extract_links_from_url + + +class TestYouTubeExtraction: + """Test YouTube link detection in extract_links_from_url.""" + + @patch("enrich_tba.get_all_links") + def test_youtube_channel_detected(self, mock_links): + """YouTube /@channel links are detected as youtube, not mastodon.""" + mock_links.return_value = [ + "https://www.youtube.com/@PyConUS", + ] + result = extract_links_from_url("https://pycon.org") + assert "youtube" in result + assert result["youtube"] == "https://www.youtube.com/@PyConUS" + assert "mastodon" not in result + + @patch("enrich_tba.get_all_links") + def test_youtube_channel_url_without_at(self, mock_links): + """YouTube channel links without @ are detected.""" + mock_links.return_value = [ + "https://www.youtube.com/channel/UCMjMBMGt0WP2usFilILnbcA", + ] + result = extract_links_from_url("https://pycon.org") + assert "youtube" in result + assert "mastodon" not in result + + @patch("enrich_tba.get_all_links") + def test_youtube_not_mistaken_for_mastodon(self, mock_links): + """YouTube /@username must not end up in mastodon field.""" + mock_links.return_value = [ + "https://www.youtube.com/@EuroPython", + "https://fosstodon.org/@europython", + ] + result = extract_links_from_url("https://europython.eu") + assert result.get("youtube") == "https://www.youtube.com/@EuroPython" + assert result.get("mastodon") == "https://fosstodon.org/@europython" + + @patch("enrich_tba.get_all_links") + def test_youtu_be_short_link(self, mock_links): + """Short youtu.be links are detected as youtube.""" + mock_links.return_value = [ + "https://youtu.be/abc123", + ] + result = extract_links_from_url("https://pycon.org") + assert "youtube" in result + assert "mastodon" not in result + + @patch("enrich_tba.get_all_links") + def test_mastodon_still_works(self, mock_links): + """Mastodon links on known instances still detected correctly.""" + mock_links.return_value = [ + "https://fosstodon.org/@pycon", + ] + result = extract_links_from_url("https://pycon.org") + assert "mastodon" in result + assert result["mastodon"] == "https://fosstodon.org/@pycon" + assert "youtube" not in result + + @patch("enrich_tba.get_all_links") + def test_generic_mastodon_still_works(self, mock_links): + """Generic /@username on unknown instances still detected as mastodon.""" + mock_links.return_value = [ + "https://social.example.org/@pyconf", + ] + result = extract_links_from_url("https://pyconf.org") + assert "mastodon" in result + assert "youtube" not in result + + @patch("enrich_tba.get_all_links") + def test_youtube_first_seen_wins(self, mock_links): + """Only the first YouTube link is kept.""" + mock_links.return_value = [ + "https://www.youtube.com/@PyConUS", + "https://www.youtube.com/@AnotherChannel", + ] + result = extract_links_from_url("https://pycon.org") + assert result["youtube"] == "https://www.youtube.com/@PyConUS" + + @patch("enrich_tba.get_all_links") + def test_all_social_links_extracted(self, mock_links): + """YouTube, Mastodon, and Bluesky can all be extracted together.""" + mock_links.return_value = [ + "https://bsky.app/profile/pycon.org", + "https://www.youtube.com/@PyConUS", + "https://fosstodon.org/@pycon", + ] + result = extract_links_from_url("https://pycon.org") + assert "bluesky" in result + assert "youtube" in result + assert "mastodon" in result diff --git a/utils/enrich_tba.py b/utils/enrich_tba.py index 6bff324463..bed4ec4bcf 100644 --- a/utils/enrich_tba.py +++ b/utils/enrich_tba.py @@ -50,7 +50,7 @@ MAX_CONTENT_LENGTH = 15000 # Max characters per conference website # Field type categorization for validation -URL_FIELDS = {"sponsor", "finaid", "mastodon", "bluesky", "cfp_link"} +URL_FIELDS = {"sponsor", "finaid", "mastodon", "bluesky", "youtube", "cfp_link"} DATE_FIELDS = {"cfp", "workshop_deadline", "tutorial_deadline"} TIMEZONE_FIELD = "timezone" @@ -267,6 +267,11 @@ def get_all_links(url: str) -> list[str]: return [] +def _domain_matches(domain: str, hosts: tuple[str, ...]) -> bool: + """Return True if domain equals one of hosts or is a subdomain of one.""" + return any(domain == h or domain.endswith(f".{h}") for h in hosts) + + # Known Mastodon instances (common ones in tech/Python community) MASTODON_INSTANCES = { "mastodon.social", @@ -334,6 +339,10 @@ def extract_links_from_url(url: str) -> dict[str, str]: for link in links: link_lower = link.lower() parsed_link = urlparse(link) + link_domain = parsed_link.netloc.lower() + + is_youtube = _domain_matches(link_domain, ("youtube.com", "youtu.be")) + is_twitter = _domain_matches(link_domain, ("twitter.com", "x.com")) # Bluesky - always bsky.app/profile/ if "bluesky" not in seen_types and "bsky.app/profile/" in link_lower: @@ -341,15 +350,19 @@ def extract_links_from_url(url: str) -> dict[str, str]: seen_types.add("bluesky") logger.debug(f" Found bluesky: {link}") + # YouTube - youtube.com/@channel or youtu.be links + elif "youtube" not in seen_types and is_youtube: + found["youtube"] = link + seen_types.add("youtube") + logger.debug(f" Found youtube: {link}") + # Mastodon - /@username pattern on known instances or any instance - # Exclude Twitter/X which don't use /@, but guard against edge cases + # Exclude Twitter/X and YouTube which also use /@username patterns elif "mastodon" not in seen_types and "/@" in link: - domain = parsed_link.netloc.lower() - - # Skip Twitter/X domains (exact host or subdomains only) - if domain == "twitter.com" or domain.endswith((".x.com", ".twitter.com")) or domain == "x.com": + # Skip Twitter/X and YouTube domains + if is_twitter or is_youtube: pass - elif domain in MASTODON_INSTANCES or "mastodon" in domain or "toot" in domain: + elif link_domain in MASTODON_INSTANCES or "mastodon" in link_domain or "toot" in link_domain: found["mastodon"] = link seen_types.add("mastodon") logger.debug(f" Found mastodon: {link}") diff --git a/utils/schema.yml b/utils/schema.yml index 9ad7e5b782..9443815202 100644 --- a/utils/schema.yml +++ b/utils/schema.yml @@ -18,6 +18,7 @@ twitter: BestConfEver # Twitter handle of conference (Optional) mastodon: https://mastodon.social/@bconf # Mastodon handle of conference (Optional) bluesky: https://bsky.app/@bconf # Bluesky handle of conference (Optional) + youtube: https://www.youtube.com/@bconf # YouTube channel of conference (Optional) sub: PY # Type of conference (see or add _data/types.yml) note: Important # In case there are extra notes about the conference (Optional) location: # Geolocation for inclusion in map diff --git a/utils/tidy_conf/schema.py b/utils/tidy_conf/schema.py index 11885c8736..ea8e391b78 100644 --- a/utils/tidy_conf/schema.py +++ b/utils/tidy_conf/schema.py @@ -72,6 +72,7 @@ class Conference(BaseModel): twitter: str | None = None mastodon: HttpUrl | None = None bluesky: str | None = None + youtube: HttpUrl | None = None sub: str note: str | None = None location: list[Location] | None = None @@ -121,7 +122,7 @@ def validate_title(cls, v): return re.sub(r"\b(19|20)\d{2}\b", "", v).strip() return v - @field_serializer("link", "cfp_link", "sponsor", "finaid", "mastodon") + @field_serializer("link", "cfp_link", "sponsor", "finaid", "mastodon", "youtube") def ser_url(self, value): return str(value) diff --git a/utils/tidy_conf/validation.py b/utils/tidy_conf/validation.py index 84cab7a1af..46a9dd1ee1 100644 --- a/utils/tidy_conf/validation.py +++ b/utils/tidy_conf/validation.py @@ -33,6 +33,7 @@ "twitter", "mastodon", "bluesky", + "youtube", "location", "extra_places", ]