diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index ec75f22..095ffcd 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -5,6 +5,7 @@ on: branches: [master] paths: - 'src/**' + - 'tests/**' jobs: test: @@ -17,28 +18,31 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} + - name: Cache pip dependencies + uses: actions/cache@v4 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ hashFiles('**/pyproject.toml') }} + restore-keys: | + ${{ runner.os }}-pip- + - name: Install package in editable mode run: | pip install --upgrade pip pip install -e . - name: Install testing and linting tools - run: pip install pytest flake8 mypy + run: pip install pytest flake8 - - name: Cache pip dependencies - uses: actions/cache@v3 - with: - path: ~/.cache/pip - key: ${{ runner.os }}-pip-${{ hashFiles('**/pyproject.toml') }} - restore-keys: | - ${{ runner.os }}-pip- + - name: Lint with flake8 + run: flake8 src/ --max-line-length=120 --count --select=E9,F63,F7,F82 --show-source --statistics - name: Run pytest run: pytest tests/ diff --git a/.gitignore b/.gitignore index f6fdb24..63be16e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ -CBBpy.egg-info/ +*.egg-info/ *.ipynb *.pyc .ipynb_checkpoints/ @@ -10,4 +10,5 @@ __pycache__/ build/ dist/ *.log -*.xlsx \ No newline at end of file +*.xlsx +.vscode/ \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 934a509..cf63eac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,6 +53,12 @@ dependencies = [ ] requires-python = ">=3.9" +[project.optional-dependencies] +dev = [ + "pytest>=7.0.0", + "flake8>=6.0.0", +] + [project.urls] homepage = "https://github.com/dcstats/CBBpy/" issues = "https://github.com/dcstats/CBBpy/issues" diff --git a/src/cbbpy/utils/cbbpy_utils.py b/src/cbbpy/utils/cbbpy_utils.py index 2fe8c22..cd8fbe3 100644 --- a/src/cbbpy/utils/cbbpy_utils.py +++ b/src/cbbpy/utils/cbbpy_utils.py @@ -177,7 +177,7 @@ def _get_games_range(start_date, end_date, game_type, info, box, pbp): date_range = pd.date_range(start_date, end_date) len_scrape = len(date_range) all_data = [] - cpus = os.cpu_count() - 1 + cpus = (os.cpu_count() or 2) - 1 if len_scrape < 1: raise InvalidDateRangeError("The start date must be sooner than the end date.") @@ -209,7 +209,7 @@ def _get_games_range(start_date, end_date, game_type, info, box, pbp): t.set_description(f"No games on {date.strftime('%D')}", refresh=False) if not len(all_data) > 0: - return () + return (pd.DataFrame(), pd.DataFrame(), pd.DataFrame()) # sort returned dataframes to ensure consistency between runs game_info_df = pd.concat([game[0] for day in all_data for game in day]) @@ -261,11 +261,15 @@ def _get_games_season(season, game_type, info, box, pbp): @print_log_file_location def _get_games_team(team, season, game_type, info, box, pbp): - cpus = os.cpu_count() - 1 + cpus = (os.cpu_count() or 2) - 1 schedule_df = _get_team_schedule(team, season, game_type) game_ids = list(schedule_df[schedule_df.game_status.isin(GOOD_GAME_STATUSES)].game_id) - print(f'Scraping {len(game_ids)} games for {schedule_df.team.iloc[0]}') + team_name = schedule_df.team.iloc[0] if len(schedule_df) > 0 else team + print(f'Scraping {len(game_ids)} games for {team_name}') + + if not game_ids: + return (pd.DataFrame(), pd.DataFrame(), pd.DataFrame()) result = Parallel(n_jobs=cpus)( delayed(_get_game)(gid, game_type, info, box, pbp) @@ -276,16 +280,16 @@ def _get_games_team(team, season, game_type, info, box, pbp): game_info_df = pd.concat([x[0] for x in result]) if info: game_info_df = game_info_df.sort_values( - by=['game_day', 'game_time', 'game_id'], - key=lambda col: pd.to_datetime(col.str.replace(r' P[SD]T', '', + by=['game_day', 'game_time', 'game_id'], + key=lambda col: pd.to_datetime(col.str.replace(r' P[SD]T', '', regex=True)) if col.name != 'game_id' else col ).reset_index(drop=True) game_boxscore_df = pd.concat([x[1] for x in result]) if box: game_boxscore_df = game_boxscore_df.sort_values( - by=['game_id', 'team'], - ascending=False, + by=['game_id', 'team'], + ascending=False, kind='mergesort' ).reset_index(drop=True) @@ -297,8 +301,6 @@ def _get_games_team(team, season, game_type, info, box, pbp): kind='mergesort' ).reset_index(drop=True) - # print(f"Log file is located at {log_file}") - return (game_info_df, game_boxscore_df, game_pbp_df) @@ -307,24 +309,24 @@ def _get_games_conference(conference, season, game_type, info, box, pbp): teams = _get_teams_from_conference(conference, season, game_type) result = [_get_games_team(x, season, game_type, info, box, pbp) for x in teams] - # sort returned dataframes to ensure consistency between runs - game_info_df = pd.concat([x[0] for x in result]) + # deduplicate intra-conference games that appear for both teams + game_info_df = pd.concat([x[0] for x in result]).drop_duplicates(subset=['game_id']) if info: game_info_df = game_info_df.sort_values( - by=['game_day', 'game_time', 'game_id'], - key=lambda col: pd.to_datetime(col.str.replace(r' P[SD]T', '', + by=['game_day', 'game_time', 'game_id'], + key=lambda col: pd.to_datetime(col.str.replace(r' P[SD]T', '', regex=True)) if col.name != 'game_id' else col ).reset_index(drop=True) - game_boxscore_df = pd.concat([x[1] for x in result]) + game_boxscore_df = pd.concat([x[1] for x in result]).drop_duplicates(subset=['game_id', 'player_id', 'team']) if box: game_boxscore_df = game_boxscore_df.sort_values( - by=['game_id', 'team'], - ascending=False, + by=['game_id', 'team'], + ascending=False, kind='mergesort' ).reset_index(drop=True) - game_pbp_df = pd.concat([x[2] for x in result]) + game_pbp_df = pd.concat([x[2] for x in result]).drop_duplicates() if pbp: game_pbp_df = game_pbp_df.sort_values( by=['game_id'], @@ -337,6 +339,7 @@ def _get_games_conference(conference, season, game_type, info, box, pbp): def _get_game_ids(date, game_type): soup = None + scoreboard = None if game_type == "mens": pre_url = MENS_SCOREBOARD_URL @@ -397,6 +400,7 @@ def _get_game_ids(date, game_type): def _get_game_boxscore(game_id, game_type): soup = None + gamepackage = None game_id = str(game_id) if game_type == "mens": @@ -470,6 +474,7 @@ def _get_game_boxscore(game_id, game_type): def _get_game_pbp(game_id, game_type): soup = None + gamepackage = None game_id = str(game_id) if game_type == "mens": @@ -534,6 +539,8 @@ def _get_game_pbp(game_id, game_type): def _get_game_info(game_id, game_type): soup = None + gamepackage = None + df = pd.DataFrame([]) game_id = str(game_id) if game_type == "mens": @@ -599,7 +606,8 @@ def _get_game_info(game_id, game_type): def _get_player_info(player_id, game_type): soup = None - df = None + raw_player = None + df = pd.DataFrame([]) if game_type == "mens": pre_url = MENS_PLAYER_URL @@ -620,11 +628,11 @@ def _get_player_info(player_id, game_type): df = _get_player_details_helper(player_id, raw_player, game_type) except Exception as ex: - if "Page not found." in soup.text: + if soup is not None and "Page not found." in soup.text: _log.error( f'{player_id} - Player: Page not found error' ) - break + return pd.DataFrame([]) if i + 1 == ATTEMPTS: # max number of attempts reached, so return blank df @@ -730,7 +738,7 @@ def _parse_date(date): for parse in DATE_PARSES: try: date = datetime.strptime(date, parse) - except: + except ValueError: continue else: parsed = True @@ -745,251 +753,72 @@ def _parse_date(date): return date -def _get_game_boxscore_helper(boxscore, game_id): - tm1, tm2 = boxscore[0], boxscore[1] - tm1_name, tm2_name = tm1["tm"]["dspNm"], tm2["tm"]["dspNm"] - tm1_stats, tm2_stats = tm1["stats"], tm2["stats"] - - labels = tm1_stats[0]["lbls"] - - tm1_starters, tm1_bench, tm1_totals = ( - tm1_stats[0]["athlts"], - tm1_stats[1]["athlts"], - tm1_stats[2]["ttls"], - ) - tm2_starters, tm2_bench, tm2_totals = ( - tm2_stats[0]["athlts"], - tm2_stats[1]["athlts"], - tm2_stats[2]["ttls"], - ) - - # starters' stats - if len(tm1_starters) > 0: - tm1_st_dict = { - labels[i].lower(): [ - tm1_starters[j]["stats"][i] for j in range(len(tm1_starters)) - ] - for i in range(len(labels)) - } - - tm1_st_pos = [ - ( - tm1_starters[i]["athlt"]["pos"] - if "pos" in tm1_starters[i]["athlt"].keys() - else "" - ) - for i in range(len(tm1_starters)) - ] - tm1_st_id = [ - ( - tm1_starters[i]["athlt"]["uid"].split(":")[-1] - if "uid" in tm1_starters[i]["athlt"].keys() - else "" - ) - for i in range(len(tm1_starters)) - ] - tm1_st_nm = [ - ( - tm1_starters[i]["athlt"]["shrtNm"] - if "shrtNm" in tm1_starters[i]["athlt"].keys() - else "" - ) - for i in range(len(tm1_starters)) - ] - - tm1_st_df = pd.DataFrame(tm1_st_dict) - tm1_st_df.insert(0, "starter", True) - tm1_st_df.insert(0, "position", tm1_st_pos) - tm1_st_df.insert(0, "player_id", tm1_st_id) - tm1_st_df.insert(0, "player", tm1_st_nm) - tm1_st_df.insert(0, "team", tm1_name) - tm1_st_df.insert(0, "game_id", game_id) - - else: - cols = ["starter", "position", "player_id", "player", "team", "game_id"] + [ - x.lower() for x in labels - ] - tm1_st_df = pd.DataFrame(columns=cols) - - # bench players' stats - if len(tm1_bench) > 0: - tm1_bn_dict = { - labels[i].lower(): [tm1_bench[j]["stats"][i] for j in range(len(tm1_bench))] - for i in range(len(labels)) - } - - tm1_bn_pos = [ - ( - tm1_bench[i]["athlt"]["pos"] - if "pos" in tm1_bench[i]["athlt"].keys() - else "" - ) - for i in range(len(tm1_bench)) - ] - tm1_bn_id = [ - ( - tm1_bench[i]["athlt"]["uid"].split(":")[-1] - if "uid" in tm1_bench[i]["athlt"].keys() - else "" - ) - for i in range(len(tm1_bench)) - ] - tm1_bn_nm = [ - ( - tm1_bench[i]["athlt"]["shrtNm"] - if "shrtNm" in tm1_bench[i]["athlt"].keys() - else "" - ) - for i in range(len(tm1_bench)) - ] - - tm1_bn_df = pd.DataFrame(tm1_bn_dict) - tm1_bn_df.insert(0, "starter", False) - tm1_bn_df.insert(0, "position", tm1_bn_pos) - tm1_bn_df.insert(0, "player_id", tm1_bn_id) - tm1_bn_df.insert(0, "player", tm1_bn_nm) - tm1_bn_df.insert(0, "team", tm1_name) - tm1_bn_df.insert(0, "game_id", game_id) - - else: - cols = ["starter", "position", "player_id", "player", "team", "game_id"] + [ - x.lower() for x in labels - ] - tm1_bn_df = pd.DataFrame(columns=cols) - - # team totals - if len(tm1_totals) > 0: - tm1_tot_dict = {labels[i].lower(): [tm1_totals[i]] for i in range(len(labels))} - - tm1_tot_df = pd.DataFrame(tm1_tot_dict) - tm1_tot_df.insert(0, "starter", False) - tm1_tot_df.insert(0, "position", "TOTAL") - tm1_tot_df.insert(0, "player_id", "TOTAL") - tm1_tot_df.insert(0, "player", "TEAM") - tm1_tot_df.insert(0, "team", tm1_name) - tm1_tot_df.insert(0, "game_id", game_id) - - else: - cols = ["starter", "position", "player_id", "player", "team", "game_id"] + [ - x.lower() for x in labels - ] - tm1_tot_df = pd.DataFrame(columns=cols) - - tm1_df = pd.concat([tm1_st_df, tm1_bn_df, tm1_tot_df]) - - # starters' stats - if len(tm2_starters) > 0: - tm2_st_dict = { - labels[i].lower(): [ - tm2_starters[j]["stats"][i] for j in range(len(tm2_starters)) - ] - for i in range(len(labels)) - } - - tm2_st_pos = [ - ( - tm2_starters[i]["athlt"]["pos"] - if "pos" in tm2_starters[i]["athlt"].keys() - else "" - ) - for i in range(len(tm2_starters)) - ] - tm2_st_id = [ - ( - tm2_starters[i]["athlt"]["uid"].split(":")[-1] - if "uid" in tm2_starters[i]["athlt"].keys() - else "" - ) - for i in range(len(tm2_starters)) - ] - tm2_st_nm = [ - ( - tm2_starters[i]["athlt"]["shrtNm"] - if "shrtNm" in tm2_starters[i]["athlt"].keys() - else "" - ) - for i in range(len(tm2_starters)) - ] +def _extract_player_id(uid): + match = re.search(r'a:(\d+)', uid) + return match.group(1) if match else "" - tm2_st_df = pd.DataFrame(tm2_st_dict) - tm2_st_df.insert(0, "starter", True) - tm2_st_df.insert(0, "position", tm2_st_pos) - tm2_st_df.insert(0, "player_id", tm2_st_id) - tm2_st_df.insert(0, "player", tm2_st_nm) - tm2_st_df.insert(0, "team", tm2_name) - tm2_st_df.insert(0, "game_id", game_id) - else: - cols = ["starter", "position", "player_id", "player", "team", "game_id"] + [ - x.lower() for x in labels - ] - tm2_st_df = pd.DataFrame(columns=cols) +def _build_team_boxscore(players, team_name, game_id, labels, is_starter): + cols = ["starter", "position", "player_id", "player", "team", "game_id"] + [ + x.lower() for x in labels + ] + if len(players) == 0: + return pd.DataFrame(columns=cols) - # bench players' stats - if len(tm2_bench) > 0: - tm2_bn_dict = { - labels[i].lower(): [tm2_bench[j]["stats"][i] for j in range(len(tm2_bench))] - for i in range(len(labels)) - } + stat_dict = { + labels[i].lower(): [players[j]["stats"][i] for j in range(len(players))] + for i in range(len(labels)) + } + positions = [players[i]["athlt"].get("pos", "") for i in range(len(players))] + ids = [_extract_player_id(players[i]["athlt"].get("uid", "")) for i in range(len(players))] + names = [players[i]["athlt"].get("shrtNm", "") for i in range(len(players))] + + df = pd.DataFrame(stat_dict) + df.insert(0, "starter", is_starter) + df.insert(0, "position", positions) + df.insert(0, "player_id", ids) + df.insert(0, "player", names) + df.insert(0, "team", team_name) + df.insert(0, "game_id", game_id) + return df - tm2_bn_pos = [ - ( - tm2_bench[i]["athlt"]["pos"] - if "pos" in tm2_bench[i]["athlt"].keys() - else "" - ) - for i in range(len(tm2_bench)) - ] - tm2_bn_id = [ - ( - tm2_bench[i]["athlt"]["uid"].split(":")[-1] - if "uid" in tm2_bench[i]["athlt"].keys() - else "" - ) - for i in range(len(tm2_bench)) - ] - tm2_bn_nm = [ - ( - tm2_bench[i]["athlt"]["shrtNm"] - if "shrtNm" in tm2_bench[i]["athlt"].keys() - else "" - ) - for i in range(len(tm2_bench)) - ] - tm2_bn_df = pd.DataFrame(tm2_bn_dict) - tm2_bn_df.insert(0, "starter", False) - tm2_bn_df.insert(0, "position", tm2_bn_pos) - tm2_bn_df.insert(0, "player_id", tm2_bn_id) - tm2_bn_df.insert(0, "player", tm2_bn_nm) - tm2_bn_df.insert(0, "team", tm2_name) - tm2_bn_df.insert(0, "game_id", game_id) +def _build_team_totals(totals, team_name, game_id, labels): + cols = ["starter", "position", "player_id", "player", "team", "game_id"] + [ + x.lower() for x in labels + ] + if len(totals) == 0: + return pd.DataFrame(columns=cols) + + tot_dict = {labels[i].lower(): [totals[i]] for i in range(len(labels))} + df = pd.DataFrame(tot_dict) + df.insert(0, "starter", False) + df.insert(0, "position", "TOTAL") + df.insert(0, "player_id", "TOTAL") + df.insert(0, "player", "TEAM") + df.insert(0, "team", team_name) + df.insert(0, "game_id", game_id) + return df - else: - cols = ["starter", "position", "player_id", "player", "team", "game_id"] + [ - x.lower() for x in labels - ] - tm2_bn_df = pd.DataFrame(columns=cols) - # team totals - if len(tm2_totals) > 0: - tm2_tot_dict = {labels[i].lower(): [tm2_totals[i]] for i in range(len(labels))} +def _build_full_team_boxscore(stats, team_name, game_id, labels): + starters = stats[0]["athlts"] if len(stats) > 0 else [] + bench = stats[1]["athlts"] if len(stats) > 1 else [] + totals = stats[2]["ttls"] if len(stats) > 2 and "ttls" in stats[2] else [] + return pd.concat([ + _build_team_boxscore(starters, team_name, game_id, labels, True), + _build_team_boxscore(bench, team_name, game_id, labels, False), + _build_team_totals(totals, team_name, game_id, labels), + ]) - tm2_tot_df = pd.DataFrame(tm2_tot_dict) - tm2_tot_df.insert(0, "starter", False) - tm2_tot_df.insert(0, "position", "TOTAL") - tm2_tot_df.insert(0, "player_id", "TOTAL") - tm2_tot_df.insert(0, "player", "TEAM") - tm2_tot_df.insert(0, "team", tm2_name) - tm2_tot_df.insert(0, "game_id", game_id) - else: - cols = ["starter", "position", "player_id", "player", "team", "game_id"] + [ - x.lower() for x in labels - ] - tm2_tot_df = pd.DataFrame(columns=cols) +def _get_game_boxscore_helper(boxscore, game_id): + tm1, tm2 = boxscore[0], boxscore[1] + tm1_name, tm2_name = tm1["tm"]["dspNm"], tm2["tm"]["dspNm"] + labels = tm1["stats"][0]["lbls"] - tm2_df = pd.concat([tm2_st_df, tm2_bn_df, tm2_tot_df]) + tm1_df = _build_full_team_boxscore(tm1["stats"], tm1_name, game_id, labels) + tm2_df = _build_full_team_boxscore(tm2["stats"], tm2_name, game_id, labels) df = pd.concat([tm1_df, tm2_df]) @@ -998,12 +827,17 @@ def _get_game_boxscore_helper(boxscore, game_id): return pd.DataFrame([]) # SPLIT UP THE FG FIELDS - fgm = pd.to_numeric([x.split("-")[0] for x in df["fg"]], errors="coerce") - fga = pd.to_numeric([x.split("-")[1] for x in df["fg"]], errors="coerce") - thpm = pd.to_numeric([x.split("-")[0] for x in df["3pt"]], errors="coerce") - thpa = pd.to_numeric([x.split("-")[1] for x in df["3pt"]], errors="coerce") - ftm = pd.to_numeric([x.split("-")[0] for x in df["ft"]], errors="coerce") - fta = pd.to_numeric([x.split("-")[1] for x in df["ft"]], errors="coerce") + def _split_stat(series, idx): + return pd.to_numeric( + [x.split("-")[idx] if "-" in x else np.nan for x in series], + errors="coerce" + ) + fgm = _split_stat(df["fg"], 0) + fga = _split_stat(df["fg"], 1) + thpm = _split_stat(df["3pt"], 0) + thpa = _split_stat(df["3pt"], 1) + ftm = _split_stat(df["ft"], 0) + fta = _split_stat(df["ft"], 1) # GET RID OF UNWANTED COLUMNS df = df.drop(columns=["fg", "3pt", "ft"]) @@ -1057,22 +891,23 @@ def _get_game_pbp_helper(gamepackage, game_id, game_type): for x in all_plays ] hscores = [ - int(x["homeScore"]) if "homeScore" in x.keys() else np.nan for x in all_plays + pd.to_numeric(x.get("homeScore"), errors="coerce") for x in all_plays ] ascores = [ - int(x["awayScore"]) if "awayScore" in x.keys() else np.nan for x in all_plays + pd.to_numeric(x.get("awayScore"), errors="coerce") for x in all_plays ] periods = [ - int(x["period"]["number"]) if "period" in x.keys() else np.nan + pd.to_numeric(x["period"]["number"], errors="coerce") if "period" in x.keys() else np.nan for x in all_plays ] time_splits = [ - x["clock"]["displayValue"].split(":") if "clock" in x.keys() else "" + x["clock"]["displayValue"].split(":") if "clock" in x.keys() else ["0", "0"] for x in all_plays ] - minutes = [int(x[0]) for x in time_splits] - seconds = [int(x[1]) for x in time_splits] + time_splits = [ts if len(ts) >= 2 else [ts[0], "0"] for ts in time_splits] + minutes = [pd.to_numeric(x[0], errors="coerce") or 0 for x in time_splits] + seconds = [pd.to_numeric(x[1], errors="coerce") or 0 for x in time_splits] min_to_sec = [x * 60 for x in minutes] pd_secs_left = [x + y for x, y in zip(min_to_sec, seconds)] @@ -1310,7 +1145,8 @@ def _get_game_info_helper(gamepackage, game_id, game_type): at_info["records"][0]["displayValue"] if len(at_info["records"]) > 0 else "" ) - home_score, away_score = int(ht_info.get("score", 0)), int(at_info.get("score", 0)) + home_score = pd.to_numeric(ht_info.get("score", 0), errors="coerce") or 0 + away_score = pd.to_numeric(at_info.get("score", 0), errors="coerce") or 0 home_win = True if home_score > away_score and gm_status == 'Final' else False @@ -1343,7 +1179,7 @@ def _get_game_info_helper(gamepackage, game_id, game_type): try: home_spread = gamepackage['gameOdds']['odds'][-1]['pointSpread']['primary'] - except: + except (KeyError, IndexError, TypeError): home_spread = '' game_info_list = [ @@ -1463,24 +1299,27 @@ def _get_schedule_helper(jsn, team, id_, season): # get info from each game for ev in tot_events: - mat = re.search(r'gameId/(\d+)/', ev['time']['link']) + time_info = ev.get('time', {}) + mat = re.search(r'gameId/(\d+)/', time_info.get('link', '')) game_id = mat.group(1) if mat is not None else '' date = parser.parse(ev['date']['date']).astimezone(tz('America/Los_Angeles')) day = date.strftime('%B %d, %Y') time = date.strftime('%I:%M %p %Z') - opp = ev['opponent']['displayName'] - opp_id = ev['opponent']['id'] + opp_info = ev.get('opponent', {}) + opp = opp_info.get('displayName', '') + opp_id = opp_info.get('id', '') - network = ev['network'][0]['name'] if len(ev['network']) > 0 else '' - season_type = ev['seasonType']['name'] - status = ev['status']['description'] + network_list = ev.get('network', []) + network = network_list[0]['name'] if len(network_list) > 0 else '' + season_type = ev.get('seasonType', {}).get('name', '') + status = ev.get('status', {}).get('description', '') - res = ev['result'] + res = ev.get('result', {}) - if status == 'Final': - result = res['winLossSymbol'] + ' ' + res['currentTeamScore'] + '-' + res['opponentTeamScore'] + if status == 'Final' and res: + result = res.get('winLossSymbol', '') + ' ' + res.get('currentTeamScore', '') + '-' + res.get('opponentTeamScore', '') else: result = 'N/A' @@ -1511,9 +1350,13 @@ def _get_schedule_helper(jsn, team, id_, season): return df.reset_index(drop=True) +_team_map_cache = {} + def _get_team_map(game_type): - data_path = Path(__file__).parent / f'{game_type}_team_map.csv' - return pd.read_csv(data_path) + if game_type not in _team_map_cache: + data_path = Path(__file__).parent / f'{game_type}_team_map.csv' + _team_map_cache[game_type] = pd.read_csv(data_path) + return _team_map_cache[game_type] def _get_id_from_team(team, season, game_type): @@ -1587,63 +1430,44 @@ def _get_teams_from_conference(conference, season, game_type): return rel_team_df.location.tolist() -def _get_json_from_soup(soup): +def _parse_espn_json(soup): script_string = _find_json_in_content(soup) if script_string == "": return None pattern = re.compile(JSON_REGEX) - found = re.search(pattern, script_string).group(1) - js = "{" + found + "}" - jsn = json.loads(js) - - return jsn - - -def _get_gamepackage_from_soup(soup): - script_string = _find_json_in_content(soup) - - if script_string == "": + match = re.search(pattern, script_string) + if match is None: return None - pattern = re.compile(JSON_REGEX) - found = re.search(pattern, script_string).group(1) - js = "{" + found + "}" - jsn = json.loads(js) - gamepackage = jsn["page"]["content"]["gamepackage"] + js = "{" + match.group(1) + "}" + return json.loads(js) - return gamepackage +def _get_json_from_soup(soup): + return _parse_espn_json(soup) -def _get_player_from_soup(soup): - script_string = _find_json_in_content(soup) - if script_string == "": +def _get_gamepackage_from_soup(soup): + jsn = _parse_espn_json(soup) + if jsn is None: return None + return jsn["page"]["content"]["gamepackage"] - pattern = re.compile(JSON_REGEX) - found = re.search(pattern, script_string).group(1) - js = "{" + found + "}" - jsn = json.loads(js) - player = jsn["page"]["content"]["player"] - return player +def _get_player_from_soup(soup): + jsn = _parse_espn_json(soup) + if jsn is None: + return None + return jsn["page"]["content"]["player"] def _get_scoreboard_from_soup(soup): - script_string = _find_json_in_content(soup) - - if script_string == "": + jsn = _parse_espn_json(soup) + if jsn is None: return None - - pattern = re.compile(JSON_REGEX) - found = re.search(pattern, script_string).group(1) - js = "{" + found + "}" - jsn = json.loads(js) - scoreboard = jsn["page"]["content"]["scoreboard"]["evts"] - - return scoreboard + return jsn["page"]["content"]["scoreboard"]["evts"] def _find_json_in_content(soup): diff --git a/src/cbbpy/utils/mens_team_map.csv b/src/cbbpy/utils/mens_team_map.csv index 4e9a54e..62d2a5b 100644 --- a/src/cbbpy/utils/mens_team_map.csv +++ b/src/cbbpy/utils/mens_team_map.csv @@ -7978,3 +7978,4 @@ season,id,team,location,conference,conference_abb 2025,250,UT Arlington Mavericks,UT Arlington,Western Athletic Conference,wac 2025,3101,Utah Tech Trailblazers,Utah Tech,Western Athletic Conference,wac 2025,3084,Utah Valley Wolverines,Utah Valley,Western Athletic Conference,wac +2026,2441,New Haven Chargers,New Haven,Northeast Conference,neast diff --git a/src/cbbpy/utils/womens_team_map.csv b/src/cbbpy/utils/womens_team_map.csv index 56fe97c..753fa41 100644 --- a/src/cbbpy/utils/womens_team_map.csv +++ b/src/cbbpy/utils/womens_team_map.csv @@ -5605,3 +5605,4 @@ season,id,team,location,conference,conference_abb 2025,250,UT Arlington Mavericks,UT Arlington,Western Athletic Conference,wac 2025,3101,Utah Tech Trailblazers,Utah Tech,Western Athletic Conference,wac 2025,3084,Utah Valley Wolverines,Utah Valley,Western Athletic Conference,wac +2026,2441,New Haven Chargers,New Haven,Northeast Conference,neast diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..17c5b59 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,210 @@ +import pytest +import pandas as pd +from datetime import datetime +from cbbpy.utils.cbbpy_utils import ( + _parse_date, + _get_current_season, + _get_team_map, + _get_id_from_team, + _get_teams_from_conference, + _get_season_conferences, + _build_team_boxscore, + _build_team_totals, + _build_full_team_boxscore, + _extract_player_id, + CouldNotParseError, +) + + +# --- _parse_date --- + +class TestParseDate: + def test_ymd_dash(self): + assert _parse_date("2024-03-15") == datetime(2024, 3, 15) + + def test_ymd_slash(self): + assert _parse_date("2024/03/15") == datetime(2024, 3, 15) + + def test_mdy_dash(self): + assert _parse_date("03-15-2024") == datetime(2024, 3, 15) + + def test_mdy_slash(self): + assert _parse_date("03/15/2024") == datetime(2024, 3, 15) + + def test_invalid_raises(self): + with pytest.raises(CouldNotParseError): + _parse_date("not-a-date") + + def test_empty_raises(self): + with pytest.raises(CouldNotParseError): + _parse_date("") + + +# --- _get_current_season --- + +class TestGetCurrentSeason: + def test_returns_int(self): + result = _get_current_season() + assert isinstance(result, int) + + def test_matches_expected_season(self): + now = datetime.today() + expected = now.year + (1 if now.month >= 10 else 0) + assert _get_current_season() == expected + + +# --- _get_team_map --- + +class TestGetTeamMap: + def test_mens_returns_dataframe(self): + df = _get_team_map("mens") + assert isinstance(df, pd.DataFrame) + assert len(df) > 0 + + def test_womens_returns_dataframe(self): + df = _get_team_map("womens") + assert isinstance(df, pd.DataFrame) + assert len(df) > 0 + + def test_has_expected_columns(self): + df = _get_team_map("mens") + expected_cols = {"season", "id", "team", "location", "conference", "conference_abb"} + assert expected_cols.issubset(set(df.columns)) + + def test_caching(self): + df1 = _get_team_map("mens") + df2 = _get_team_map("mens") + assert df1 is df2 + + +# --- _get_id_from_team --- + +class TestGetIdFromTeam: + def test_exact_match(self): + id_, name = _get_id_from_team("Duke", 2023, "mens") + assert name == "Duke" + assert id_ is not None + + def test_fuzzy_match(self): + id_, name = _get_id_from_team("Duk", 2023, "mens") + assert name == "Duke" + + def test_womens(self): + id_, name = _get_id_from_team("UConn", 2023, "womens") + assert id_ is not None + + +# --- _get_season_conferences --- + +class TestGetSeasonConferences: + def test_returns_conferences(self): + df = _get_season_conferences(2023, "mens") + assert isinstance(df, pd.DataFrame) + assert len(df) > 0 + assert "conference" in df.columns + assert "conference_abb" in df.columns + + def test_known_conference_present(self): + df = _get_season_conferences(2023, "mens") + assert "Atlantic Coast Conference" in df.conference.values + + +# --- _get_teams_from_conference --- + +class TestGetTeamsFromConference: + def test_returns_team_list(self): + teams = _get_teams_from_conference("acc", 2023, "mens") + assert isinstance(teams, list) + assert len(teams) > 0 + + def test_known_team_in_conference(self): + teams = _get_teams_from_conference("acc", 2023, "mens") + assert "Duke" in teams + + def test_exact_abbreviation(self): + teams = _get_teams_from_conference("acc", 2023, "mens") + assert "Duke" in teams + + def test_fuzzy_conference_match(self): + teams = _get_teams_from_conference("atlantic coast", 2023, "mens") + assert len(teams) > 0 + + def test_womens(self): + teams = _get_teams_from_conference("big ten", 2023, "womens") + assert isinstance(teams, list) + assert len(teams) > 0 + + +# --- boxscore helpers --- + +class TestBuildTeamBoxscore: + LABELS = ["MIN", "FG", "3PT", "FT", "OREB", "DREB", "REB", "AST", "STL", "BLK", "TO", "PF", "PTS"] + + def test_empty_players(self): + df = _build_team_boxscore([], "TestTeam", "12345", self.LABELS, True) + assert isinstance(df, pd.DataFrame) + assert len(df) == 0 + + def test_single_player(self): + players = [{ + "athlt": {"pos": "G", "uid": "s:40~l:41~a:999", "shrtNm": "J. Doe"}, + "stats": ["30", "5-10", "2-4", "3-3", "1", "4", "5", "3", "1", "0", "2", "2", "15"], + }] + df = _build_team_boxscore(players, "TestTeam", "12345", self.LABELS, True) + assert len(df) == 1 + assert df.iloc[0]["player"] == "J. Doe" + assert df.iloc[0]["team"] == "TestTeam" + assert bool(df.iloc[0]["starter"]) is True + assert df.iloc[0]["position"] == "G" + assert df.iloc[0]["player_id"] == "999" + + def test_missing_athlete_fields(self): + players = [{"athlt": {}, "stats": ["0"] * 13}] + df = _build_team_boxscore(players, "Team", "123", self.LABELS, False) + assert len(df) == 1 + assert df.iloc[0]["player"] == "" + assert df.iloc[0]["position"] == "" + + def test_totals_empty(self): + df = _build_team_totals([], "Team", "123", self.LABELS) + assert len(df) == 0 + + def test_totals(self): + totals = ["200", "30-60", "10-20", "15-18", "8", "22", "30", "15", "5", "3", "10", "15", "85"] + df = _build_team_totals(totals, "Team", "123", self.LABELS) + assert len(df) == 1 + assert df.iloc[0]["player"] == "TEAM" + assert df.iloc[0]["position"] == "TOTAL" + + def test_full_team_boxscore(self): + player = { + "athlt": {"pos": "F", "uid": "s:40~l:41~a:111", "shrtNm": "A. Player"}, + "stats": ["20"] + ["0-0"] * 3 + ["0"] * 9, + } + stats = [ + {"athlts": [player], "lbls": self.LABELS}, + {"athlts": [], "lbls": self.LABELS}, + {"ttls": ["100"] + ["0-0"] * 3 + ["0"] * 9}, + ] + df = _build_full_team_boxscore(stats, "Team", "123", self.LABELS) + assert len(df) == 2 # 1 starter + 0 bench + 1 total + + def test_full_team_boxscore_missing_sections(self): + df = _build_full_team_boxscore([], "Team", "123", self.LABELS) + assert len(df) == 0 + + +# --- _extract_player_id --- + +class TestExtractPlayerId: + def test_standard_uid(self): + assert _extract_player_id("s:40~l:41~a:999") == "999" + + def test_missing_uid(self): + assert _extract_player_id("") == "" + + def test_malformed_uid_no_athlete(self): + assert _extract_player_id("s:40~l:41") == "" + + def test_uid_single_segment(self): + assert _extract_player_id("s:40") == ""