Skip to content

Commit

Permalink
feat: 🌱 Seed Command
Browse files Browse the repository at this point in the history
commit 218ce70
Author: Shawn Santhoshgeorge <32755895+ShawnGeorge03@users.noreply.github.com>
Date:   Thu Mar 7 18:11:19 2024 -0500

    feat: 🌱 created a seed command

    generates 3 new leaderboard documents

commit 200565e
Author: Shawn Santhoshgeorge <32755895+ShawnGeorge03@users.noreply.github.com>
Date:   Thu Mar 7 18:05:39 2024 -0500

    feat: ✨ modified commands using subparsers

commit e3a0ff5
Author: Shawn Santhoshgeorge <32755895+ShawnGeorge03@users.noreply.github.com>
Date:   Thu Mar 7 15:38:26 2024 -0500

    style: 🎨 Formatted Code

commit 970b95a
Author: Shawn Santhoshgeorge <32755895+ShawnGeorge03@users.noreply.github.com>
Date:   Thu Mar 7 15:19:50 2024 -0500

    refactor: ♻️ updated magic constants in `shared`

    Constants: `MAX_NUM_OF_TEAMS`, `DECIMALS`, `ROOT_FOLDER_PATH`

commit 957213d
Author: Shawn Santhoshgeorge <32755895+ShawnGeorge03@users.noreply.github.com>
Date:   Thu Mar 7 15:03:20 2024 -0500

    perf: ⚡️ reduced pandas imports

commit 86453b6
Author: Shawn Santhoshgeorge <32755895+ShawnGeorge03@users.noreply.github.com>
Date:   Thu Mar 7 14:51:07 2024 -0500

    docs: 📝 updated instructions and commands in README

    added `--dev` to install develop packages

commit 7dab886
Author: Shawn Santhoshgeorge <32755895+ShawnGeorge03@users.noreply.github.com>
Date:   Thu Mar 7 14:40:50 2024 -0500

    chore: ➕ installed `pyarrow` as pandas dependency

    `pyarrow` will be a future dependency for pandas: pandas-dev/pandas#54466
  • Loading branch information
ShawnGeorge03 committed Mar 7, 2024
1 parent df2643d commit 1871492
Show file tree
Hide file tree
Showing 6 changed files with 354 additions and 188 deletions.
1 change: 1 addition & 0 deletions Pipfile
Expand Up @@ -12,6 +12,7 @@ pymongo = "*"

[dev-packages]
tabulate = "*"
pyarrow = "*"

[requires]
python_version = "3.12"
Expand Down
337 changes: 211 additions & 126 deletions Pipfile.lock

Large diffs are not rendered by default.

16 changes: 9 additions & 7 deletions README.md
Expand Up @@ -12,7 +12,7 @@ Creates a server to update DS3 Datathon Leaderboard using Kaggle Leaderboard
Read [below](#custom-kaggle-api)

```bash
pipenv install
pipenv install --dev
```

2. Copy the `.env.example` to `.env`:
Expand All @@ -33,12 +33,14 @@ Creates a server to update DS3 Datathon Leaderboard using Kaggle Leaderboard

All commands are run from the root of the project, from a terminal:

| Command | Action |
| -------------------- | ---------------------------------------------- |
| `pipenv install` | Installs packages to virtual environment |
| `pipenv run dev` | Starts local dev server at `0.0.0.0:8000` |
| `pipenv run start` | Creates a production server at `0.0.0.0:8000` |
| Command | Action |
| ---------------------- | ----------------------------------------------|
| `pipenv install` | Installs default packages |
| `pipenv install --dev` | Installs develop and default packages |
| `pipenv run dev` | Starts local dev server at `0.0.0.0:8000` |
| `pipenv run start` | Creates a production server at `0.0.0.0:8000` |

Check the [`pipenv`](https://pipenv.pypa.io/en/latest/cli.html) docs for more commands

Checkout the [Wiki](https://github.com/theDS3/Datathon-Leaderboard/wiki) for details about build images and other topics
Check out the [Wiki](https://github.com/theDS3/Datathon-Leaderboard/wiki) for
details about build images and other topics
132 changes: 103 additions & 29 deletions src/leaderboard.py
Expand Up @@ -4,8 +4,13 @@
from dotenv import dotenv_values
from tabulate import tabulate

from numpy.random import randint, uniform
from pandas import DataFrame

from shared import (
ROOT_PATH,
ROOT_FOLDER_PATH,
MAX_NUM_OF_TEAMS,
DECIMALS,
get_abs_file_paths,
process_csv,
process_competitions,
Expand All @@ -21,53 +26,122 @@ class Secrets(TypedDict):
config: Secrets = dotenv_values(".env")

if len(config) == 0:
raise RuntimeError('Missing .env file')
raise RuntimeError("Missing .env file")

parser = ArgumentParser(
prog="leaderboard.py",
description="Create Private & Final Leaderboard Entry",
description="Add Leaderboard Entry",
epilog="Update Datathon Leaderboard",
)

parser.add_argument(
subparsers = parser.add_subparsers(dest="command")

# Seed command: generate mock standings for every leaderboard type
seed_parser = subparsers.add_parser("seed", help="Generate Mock Standings")
seed_parser.add_argument(
    "num_of_teams",
    metavar="teams-size",
    type=int,
    nargs="?",  # optional positional; falls back to the project-wide cap
    default=MAX_NUM_OF_TEAMS,
    # Fixed: this is the number of mock teams to generate, not a RNG seed value
    help=f"Number of teams to generate (default: {MAX_NUM_OF_TEAMS})",
)

# Show command
show_parser = subparsers.add_parser("show", help="Display the latest Standing")
show_parser.add_argument(
"type",
metavar="type",
type=str,
choices={"public", "private", "final"},
help="Leaderboard Entry Type",
choices={"private", "final"},
)

parser.add_argument(
"-p",
"--publish",
help="Publish the latest standing",
action="store_true",
default=False,
# Publish command: push the latest standings to the database.
# Unlike `show` (private/final only), `publish` also accepts "public".
publish_parser = subparsers.add_parser("publish", help="Publish the latest Standing")
publish_parser.add_argument(
    "type",
    metavar="type",
    type=str,
    choices={"public", "private", "final"},
    help="Leaderboard Entry Type",
)

args = parser.parse_args()

PRIVATE_PATH = f"{ROOT_PATH}/private"

client = MongoClient(config.get("MONGO_URI"))
db = client.get_database(config.get("MONGO_DB"))
leaderboard_col = db.get_collection("leaderboard")

csv_files = get_abs_file_paths(PRIVATE_PATH, ".csv")
df = process_csv(csv_files)
standings = process_competitions(df, args.type, leaderboard_col)

if len(standings) == 0:
raise ValueError(f"Review the CSV files in {PRIVATE_PATH}")


print(f"Latest Standing for {args.type.capitalize()} Leaderboard")
headers = [x.capitalize() for x in standings[0].keys()]
rows = [x.values() for x in standings]
print(tabulate(rows, headers, tablefmt="mixed_outline", numalign='left'))

if args.publish:
update_leaderboard(standings, args.type, leaderboard_col)
print(f"Updated {args.type.capitalize()} Leaderboard on {db} Database")
if args.command == "seed":
    # Mock team names and a shared per-team attempt count, reused for
    # every leaderboard type so the three entries describe the same teams.
    name = [f"Team-{i}" for i in range(1, args.num_of_teams + 1)]
    attempts = randint(low=1, high=40, size=args.num_of_teams)

    # PEP 8 (E731): named defs instead of lambda assignments.
    def score(high=100):
        """Per-team score: sum of 3 uniform draws in [0, high)."""
        return uniform(low=0, high=high, size=(args.num_of_teams, 3)).sum(axis=1)

    def delta():
        """Per-team rank movement: '-' for 0, '+n' for gains, n for losses."""
        return [
            "-" if x == 0 else f"+{x}" if x > 0 else x
            for x in randint(-12, 12, size=args.num_of_teams)
        ]

    for leaderboard_type in ["public", "private", "final"]:
        df = DataFrame(
            {
                "name": name,
                "score": score(100 if leaderboard_type != "final" else 94),
                "attempts": attempts,
                "delta": delta(),
            }
        )

        if leaderboard_type == "final":
            # Final standings apply a random bonus multiplier to the raw
            # score and rank on the resulting finalScore.
            df = df.assign(
                bonus=uniform(0.1, 2.0, size=args.num_of_teams)
                ** randint(1, 4, size=args.num_of_teams)
            )
            df = df.assign(finalScore=df["bonus"] * df["score"])

            df.sort_values(
                by=["finalScore", "score", "attempts"],
                inplace=True,
                ascending=[False, False, True],
            )
            df["bonus"] = df["bonus"].astype(float).round(DECIMALS)
            df["score"] = df["score"].astype(float).round(DECIMALS)
            df["finalScore"] = df["finalScore"].astype(float).round(DECIMALS)
        else:
            # Public/private rank on score, breaking ties by fewer attempts.
            df.sort_values(
                by=["score", "attempts"], inplace=True, ascending=[False, True]
            )
            df["score"] = df["score"].astype(float).round(DECIMALS)

        update_leaderboard(df.to_dict("records"), leaderboard_type, leaderboard_col)
        print(
            f"Added Mock Entry for {leaderboard_type.capitalize()} "
            f"Leaderboard on {db.name} Database"
        )

elif args.command in ["show", "publish"]:
    PRIVATE_PATH = f"{ROOT_FOLDER_PATH}/private"

    # Build the latest standings from the competition CSV exports.
    csv_files = get_abs_file_paths(PRIVATE_PATH, ".csv")
    df = process_csv(csv_files)
    standings = process_competitions(df, args.type, leaderboard_col)

    if len(standings) == 0:
        # Private/final standings come from local CSVs; public comes from Kaggle.
        if args.type != "public":
            raise ValueError(f"Review the CSV files in {PRIVATE_PATH}")
        raise ValueError("Review the Competition on Kaggle")  # F541: no placeholders

    print(f"Latest Standing for {args.type.capitalize()} Leaderboard")
    headers = [x.capitalize() for x in standings[0].keys()]
    rows = [x.values() for x in standings]
    print(tabulate(rows, headers, tablefmt="mixed_outline", numalign="left"))

    if args.command == "publish":
        update_leaderboard(standings, args.type, leaderboard_col)
        print(f"Updated {args.type.capitalize()} Leaderboard on {db.name} Database")

client.close()
8 changes: 4 additions & 4 deletions src/main.py
Expand Up @@ -10,8 +10,8 @@
from pydantic import BaseModel

from src.shared import (
ROOT_PATH,
MAX_TEAMS,
ROOT_FOLDER_PATH,
MAX_NUM_OF_TEAMS,
get_abs_file_paths,
process_csv,
process_competitions,
Expand All @@ -20,12 +20,12 @@

class Request(BaseModel):
competitions: list[str]
numOfTeams: int = MAX_TEAMS
numOfTeams: int = MAX_NUM_OF_TEAMS


app = FastAPI()

PUBLIC_PATH = f"{ROOT_PATH}/public"
PUBLIC_PATH = f"{ROOT_FOLDER_PATH}/public"

def download_competitions(competitions: list[str]) -> list[str]:
"""Downloads and Extracts leaderboard information from multiple competitions
Expand Down
48 changes: 26 additions & 22 deletions src/shared.py
@@ -1,11 +1,11 @@
from typing import TypedDict, NotRequired, Literal, Dict
from typing import TypedDict, NotRequired, Literal
from pymongo.collection import Collection

from datetime import datetime
from pytz import timezone
from pandas import DataFrame, read_csv, merge

import os
import pandas as pd

type LeaderboardType = Literal["public", "private", "final"]

Expand All @@ -20,9 +20,13 @@ class LeaderboardEntry(TypedDict):


# Maximum number of teams to be displayed
MAX_TEAMS = 40
MAX_NUM_OF_TEAMS = 40

ROOT_PATH = "data"
# Maximum decimal places for rounding
DECIMALS = 5

# Root Folder
ROOT_FOLDER_PATH = "data"


def get_abs_file_paths(
Expand Down Expand Up @@ -62,7 +66,7 @@ def get_abs_file_paths(
return files if not is_sorted else sorted(files)


def process_csv(csv_file_paths: list[str]) -> pd.DataFrame:
def process_csv(csv_file_paths: list[str]) -> DataFrame:
"""Extract leaderboard information from *.csv
Args:
Expand All @@ -72,13 +76,13 @@ def process_csv(csv_file_paths: list[str]) -> pd.DataFrame:
ValueError: `InvalidFile`
Returns:
pd.DataFrame: Kaggle API Data with Scores and Counts for each Team from all competitions
DataFrame: Kaggle API Data with Scores and Counts for each Team from all competitions
"""

df = pd.DataFrame()
df = DataFrame()
for csv_file_path in csv_file_paths:
# Read the CSV files and filter specific columns
df_comp = pd.read_csv(csv_file_path).filter(
df_comp = read_csv(csv_file_path).filter(
["TeamName", "Score", "SubmissionCount"]
)

Expand All @@ -95,24 +99,24 @@ def process_csv(csv_file_paths: list[str]) -> pd.DataFrame:
]

# Join the multiple csv from
df = df_comp if df.empty else pd.merge(df, df_comp, on=["name"], how="outer")
df = df_comp if df.empty else merge(df, df_comp, on=["name"], how="outer")

return df


def process_competitions(
df: pd.DataFrame,
df: DataFrame,
leaderboard_type: LeaderboardType,
coll: Collection,
size=MAX_TEAMS,
size=MAX_NUM_OF_TEAMS,
) -> list[LeaderboardEntry]:
"""Process the leaderboard data from multiple competitions
Args:
df (pd.DataFrame): Kaggle API Data with Scores and Counts for each Team from all competitions.
df (DataFrame): Kaggle API Data with Scores and Counts for each Team from all competitions.
leaderboard_type (LeaderboardType): Specific mode of calculations
coll (Collection): Collection that contains the leaderboard snapshots.
size (int, optional): Limits number of teams. Defaults to `MAX_TEAMS`.
size (int, optional): Limits number of teams. Defaults to `MAX_NUM_OF_TEAMS`.
Raises:
ValueError: `InvalidCollection`
Expand Down Expand Up @@ -166,14 +170,14 @@ def process_competitions(
df["bonus"] = 1.0
df.set_index("name", inplace=True)

MAPPING_CSV = f"{ROOT_PATH}/final/mapping.csv"
BONUS_PATH = f"{ROOT_PATH}/final/bonus"
MAPPING_CSV = f"{ROOT_FOLDER_PATH}/final/mapping.csv"
BONUS_PATH = f"{ROOT_FOLDER_PATH}/final/bonus"

if not os.path.isfile(MAPPING_CSV):
raise ValueError(f"FileDoesNotExist: {MAPPING_CSV}")

# Get mapping
df_mapping = pd.read_csv(MAPPING_CSV)
df_mapping = read_csv(MAPPING_CSV)

# Get a list of all CSV files in the folder
csv_files = get_abs_file_paths(BONUS_PATH, ".csv")
Expand All @@ -185,10 +189,10 @@ def process_competitions(
for csv_file in csv_files:

# Load the CSV file into a dataframe
df_attendance = pd.read_csv(csv_file)
df_attendance = read_csv(csv_file)

# Subsetting attendance
merged_df = pd.merge(df_mapping, df_attendance, on="Email", how="inner")
merged_df = merge(df_mapping, df_attendance, on="Email", how="inner")

# Populating with 1's
merged_df["Attendance"] = 1
Expand Down Expand Up @@ -221,9 +225,9 @@ def process_competitions(
df.reset_index(inplace=True, drop=False)

# round
df["bonus"] = df["bonus"].astype(float).round(5)
df["score"] = df["score"].astype(float).round(5)
df["finalScore"] = df["finalScore"].astype(float).round(5)
df["bonus"] = df["bonus"].astype(float).round(DECIMALS)
df["score"] = df["score"].astype(float).round(DECIMALS)
df["finalScore"] = df["finalScore"].astype(float).round(DECIMALS)

compare_leaderboard_type = "private"
curr_standings = df[:size][
Expand All @@ -236,7 +240,7 @@ def process_competitions(
df.reset_index(inplace=True, drop=True)

# round
df["score"] = df["score"].astype(float).round(5)
df["score"] = df["score"].astype(float).round(DECIMALS)

curr_standings = df[:size][["name", "score", "attempts"]].to_dict("records")

Expand Down

0 comments on commit 1871492

Please sign in to comment.