feat: initial dataloading

finalized the data grabbing, sampling, and loading code, along with defining the schema for ft
This commit is contained in:
2026-04-07 13:22:50 +02:00
parent d418c44437
commit 6dc0486309
5 changed files with 374 additions and 75 deletions
-74
View File
@@ -1,74 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "05c0de49-0eb8-4cd2-89b2-c312111dda85",
"metadata": {},
"outputs": [],
"source": [
"%pip install redis hiredis fastid faker tqdm"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "90326242-f141-4035-9053-d3aab6cc9224",
"metadata": {},
"outputs": [],
"source": [
"from redis.cluster import RedisCluster\n",
"\n",
"# Cluster client pointed at the `redis` host on port 6379.\n",
"client = RedisCluster(\n",
"    host=\"redis\",\n",
"    port=6379,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "50b9f473-8924-4d03-acf8-71ecf25e54a8",
"metadata": {},
"outputs": [],
"source": [
"from fastid import ulid\n",
"from faker import Faker\n",
"from tqdm.notebook import trange\n",
"faker = Faker()\n",
"\n",
"TOTAL_PROFILES = 15_000_000  # number of placeholder profile hashes to create\n",
"BATCH_SIZE = 10_000  # commands buffered per pipeline round trip\n",
"\n",
"# Write the profiles through a pipeline, flushing every BATCH_SIZE commands so the\n",
"# client never holds the whole workload in memory and never pays one round trip per row.\n",
"with client.pipeline() as p:\n",
"    for created in trange(1, TOTAL_PROFILES + 1):\n",
"        p.hset(f\"profile:{ulid()}\", mapping={\"name\": \"John Doe\", \"sex\": \"M\"})\n",
"        if created % BATCH_SIZE == 0:\n",
"            p.execute()\n",
"    p.execute()  # flush whatever is left in the final partial batch"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "23933aa3-ff5d-45f9-b402-59e58c02a2b3",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
+133
View File
@@ -0,0 +1,133 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "05c0de49-0eb8-4cd2-89b2-c312111dda85",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: redis in /opt/conda/lib/python3.13/site-packages (7.4.0)\n",
"Requirement already satisfied: hiredis in /opt/conda/lib/python3.13/site-packages (3.3.1)\n",
"Requirement already satisfied: fastid in /opt/conda/lib/python3.13/site-packages (0.0.5)\n",
"Requirement already satisfied: tqdm in /opt/conda/lib/python3.13/site-packages (4.67.3)\n",
"Requirement already satisfied: kagglehub in /opt/conda/lib/python3.13/site-packages (1.0.0)\n",
"Requirement already satisfied: kagglesdk<1.0,>=0.1.14 in /opt/conda/lib/python3.13/site-packages (from kagglehub) (0.1.16)\n",
"Requirement already satisfied: packaging in /opt/conda/lib/python3.13/site-packages (from kagglehub) (26.0)\n",
"Requirement already satisfied: pyyaml in /opt/conda/lib/python3.13/site-packages (from kagglehub) (6.0.3)\n",
"Requirement already satisfied: requests in /opt/conda/lib/python3.13/site-packages (from kagglehub) (2.32.5)\n",
"Requirement already satisfied: protobuf in /opt/conda/lib/python3.13/site-packages (from kagglesdk<1.0,>=0.1.14->kagglehub) (6.33.5)\n",
"Requirement already satisfied: charset_normalizer<4,>=2 in /opt/conda/lib/python3.13/site-packages (from requests->kagglehub) (3.4.6)\n",
"Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.13/site-packages (from requests->kagglehub) (3.11)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.13/site-packages (from requests->kagglehub) (2.6.3)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.13/site-packages (from requests->kagglehub) (2026.2.25)\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"%pip install redis hiredis fastid tqdm kagglehub"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "90326242-f141-4035-9053-d3aab6cc9224",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"from redis.cluster import RedisCluster\n",
"from redis.commands.search.field import TextField, NumericField, TagField\n",
"from redis.commands.search.index_definition import IndexDefinition, IndexType\n",
"\n",
"# Connection settings are read from the environment so real credentials do not have to\n",
"# live in the notebook; each fallback is the value this cell previously hardcoded.\n",
"client = RedisCluster(\n",
"    host=os.environ.get(\"REDIS_HOST\", \"redis\"),\n",
"    port=int(os.environ.get(\"REDIS_PORT\", \"6379\")),\n",
"    username=os.environ.get(\"REDIS_USERNAME\", \"admin\"),\n",
"    password=os.environ.get(\"REDIS_PASSWORD\", \"admin\"),\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "50b9f473-8924-4d03-acf8-71ecf25e54a8",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "887890574a2048a587b6ed4dd8eef7c6",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/1000000 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from fastid import ulid\n",
"from tqdm.notebook import trange\n",
"\n",
"TOTAL_PROFILES = 1_000_000  # number of placeholder profile hashes to create\n",
"BATCH_SIZE = 10_000  # commands buffered per pipeline round trip\n",
"\n",
"# Write the profiles through a pipeline, flushing every BATCH_SIZE commands so the\n",
"# client never holds the whole workload in memory and never pays one round trip per row.\n",
"with client.pipeline() as p:\n",
"    for created in trange(1, TOTAL_PROFILES + 1):\n",
"        p.hset(f\"profile:{ulid()}\", mapping={\"name\": \"John Doe\", \"sex\": \"M\"})\n",
"        if created % BATCH_SIZE == 0:\n",
"            p.execute()\n",
"    p.execute()  # flush whatever is left in the final partial batch"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "23933aa3-ff5d-45f9-b402-59e58c02a2b3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"b'OK'"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# The index covers every hash whose key starts with `profile:`.\n",
"profile_index_definition = IndexDefinition(\n",
"    prefix=[\"profile:\"],\n",
"    index_type=IndexType.HASH,\n",
")\n",
"\n",
"# `name` is declared as a TextField and `sex` as a TagField.\n",
"schema = (TextField(\"name\"), TagField(\"sex\"))\n",
"\n",
"profile_index = client.ft(\"idx:profile\")\n",
"created_idx = profile_index.create_index(schema, definition=profile_index_definition)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
+240
View File
@@ -0,0 +1,240 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "071f6969-c01a-4369-a763-871f5b9e65b3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: redis in /opt/conda/lib/python3.13/site-packages (7.4.0)\n",
"Requirement already satisfied: hiredis in /opt/conda/lib/python3.13/site-packages (3.3.1)\n",
"Requirement already satisfied: fastid in /opt/conda/lib/python3.13/site-packages (0.0.5)\n",
"Requirement already satisfied: tqdm in /opt/conda/lib/python3.13/site-packages (4.67.3)\n",
"Requirement already satisfied: kagglehub in /opt/conda/lib/python3.13/site-packages (1.0.0)\n",
"Requirement already satisfied: kagglesdk<1.0,>=0.1.14 in /opt/conda/lib/python3.13/site-packages (from kagglehub) (0.1.16)\n",
"Requirement already satisfied: packaging in /opt/conda/lib/python3.13/site-packages (from kagglehub) (26.0)\n",
"Requirement already satisfied: pyyaml in /opt/conda/lib/python3.13/site-packages (from kagglehub) (6.0.3)\n",
"Requirement already satisfied: requests in /opt/conda/lib/python3.13/site-packages (from kagglehub) (2.32.5)\n",
"Requirement already satisfied: protobuf in /opt/conda/lib/python3.13/site-packages (from kagglesdk<1.0,>=0.1.14->kagglehub) (6.33.5)\n",
"Requirement already satisfied: charset_normalizer<4,>=2 in /opt/conda/lib/python3.13/site-packages (from requests->kagglehub) (3.4.6)\n",
"Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.13/site-packages (from requests->kagglehub) (3.11)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.13/site-packages (from requests->kagglehub) (2.6.3)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.13/site-packages (from requests->kagglehub) (2026.2.25)\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"%pip install redis hiredis fastid tqdm kagglehub"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "526cb932-59aa-489e-bbba-954ac645b633",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from csv import DictReader\n",
"from pathlib import Path\n",
"\n",
"import kagglehub\n",
"from fastid import ulid\n",
"from redis.backoff import ExponentialBackoff\n",
"from redis.cache import CacheConfig\n",
"from redis.cluster import RedisCluster\n",
"from redis.retry import Retry\n",
"from tqdm.notebook import tqdm\n",
"\n",
"# Retry policy handed to the client: exponential backoff, 8 retries.\n",
"retry = Retry(ExponentialBackoff(), 8)\n",
"\n",
"# Connection settings are read from the environment so real credentials do not have to\n",
"# live in the notebook; each fallback is the value this cell previously hardcoded.\n",
"client = RedisCluster(\n",
"    host=os.environ.get(\"REDIS_HOST\", \"redis\"),\n",
"    port=int(os.environ.get(\"REDIS_PORT\", \"6379\")),\n",
"    username=os.environ.get(\"REDIS_USERNAME\", \"admin\"),\n",
"    password=os.environ.get(\"REDIS_PASSWORD\", \"admin\"),\n",
"    retry=retry,\n",
"    protocol=3,\n",
"    cache_config=CacheConfig(),\n",
"    decode_responses=True,\n",
"    health_check_interval=3,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "8d0875b5-f004-44fc-9519-89fc73c42b77",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Path to dataset files: /home/jovyan/.cache/kagglehub/datasets/awesomizer/rainbox-six-siege-dataset/versions/1\n"
]
}
],
"source": [
"# Fetch the dataset through kagglehub and keep the returned location as a Path.\n",
"download_location = kagglehub.dataset_download(\"awesomizer/rainbox-six-siege-dataset\")\n",
"dataset = Path(download_location)\n",
"\n",
"print(f\"Path to dataset files: {dataset}\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "be4c5461-1db2-4226-b71e-cdc7f06615bd",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"True"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"True"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Locations of the three CSV files inside the downloaded dataset directory.\n",
"full_dump = dataset.joinpath(\"datadump_S5\", \"datadump_S5.csv\")\n",
"objectives = dataset.joinpath(\n",
"    \"datadump_S5_summary_objectives\", \"datadump_S5_summary_objectives.csv\"\n",
")\n",
"operator_loadouts = dataset.joinpath(\n",
"    \"dataDump_s5_summary_operator_loadout\", \"dataDump_s5_summary_operator_loadout.csv\"\n",
")\n",
"\n",
"# Show one boolean per file, in the order above, telling whether the path exists.\n",
"for csv_path in (full_dump, objectives, operator_loadouts):\n",
"    display(csv_path.exists())"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "2efb5699-cced-4df0-9d76-278e23874436",
"metadata": {},
"outputs": [],
"source": [
"# Count the lines of the full dump in binary mode, then subtract one for the header line;\n",
"# the result is used as the progress-bar total when the file is imported.\n",
"with full_dump.open(\"rb\") as raw_csv:\n",
"    line_count = sum(1 for _ in raw_csv)\n",
"rows = line_count - 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9177760b-eec4-4ab4-8242-0dd8df5db9dd",
"metadata": {},
"outputs": [],
"source": [
"MATCHES_PER_DAY = 2  # distinct matches kept for each dateid\n",
"BATCH_SIZE = 5_000  # rows buffered per pipeline round trip\n",
"\n",
"# Maps each dateid to the set of matchids selected for that day.\n",
"daily_matches = {}\n",
"\n",
"# newline='' is how the csv module expects text files to be opened; errors='ignore'\n",
"# drops bytes that cannot be decoded instead of aborting the import.\n",
"with full_dump.open(newline=\"\", errors=\"ignore\") as csvfile, client.pipeline() as pipe:\n",
"    reader = DictReader(csvfile, delimiter=\";\")\n",
"    pending = 0\n",
"    for row in tqdm(reader, desc=\"Importing raw data into Redis...\", total=rows, unit=\"row\"):\n",
"        date_id, match_id = row[\"dateid\"], row[\"matchid\"]\n",
"        selected = daily_matches.setdefault(date_id, set())\n",
"        if match_id not in selected:\n",
"            # There's a ton of data, so only the first MATCHES_PER_DAY matches seen on a\n",
"            # given day are kept; rows belonging to any later match are skipped.\n",
"            if len(selected) >= MATCHES_PER_DAY:\n",
"                continue\n",
"            selected.add(match_id)\n",
"            display(f\"Selected {match_id} for day {date_id}\")\n",
"        # Queue the row rather than paying one network round trip per HSET.\n",
"        pipe.hset(f\"raw:{ulid()}\", mapping=row)\n",
"        pending += 1\n",
"        if pending >= BATCH_SIZE:\n",
"            pipe.execute()\n",
"            pending = 0\n",
"    pipe.execute()  # flush whatever is left in the final partial batch"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d8b75e35-8bef-4b2b-ae25-1982076c73fd",
"metadata": {},
"outputs": [],
"source": [
"from redis.commands.search.field import TagField, NumericField, TextField\n",
"from redis.commands.search.index_definition import IndexDefinition, IndexType\n",
"\n",
"# Options shared by the numeric fields that are declared with sortable=True and no_index=True.\n",
"SORT_ONLY = {\"sortable\": True, \"no_index\": True}\n",
"\n",
"# One entry per indexed hash field: (field class, field name, extra options), in schema order.\n",
"field_specs = (\n",
"    (NumericField, \"dateid\", SORT_ONLY),\n",
"    (TagField, \"platform\", {}),\n",
"    (TagField, \"gamemode\", {}),\n",
"    (TagField, \"mapname\", {}),\n",
"    (NumericField, \"matchid\", {}),\n",
"    (NumericField, \"roundnumber\", SORT_ONLY),\n",
"    (TagField, \"objectivelocation\", {}),\n",
"    (TagField, \"winrole\", {}),\n",
"    (TagField, \"endroundreason\", {}),\n",
"    (NumericField, \"roundduration\", SORT_ONLY),\n",
"    (NumericField, \"clearancelevel\", SORT_ONLY),\n",
"    (TagField, \"skillrank\", {}),\n",
"    (TagField, \"role\", {}),\n",
"    (NumericField, \"team\", {}),\n",
"    (NumericField, \"haswon\", {}),\n",
"    (TagField, \"operator\", {}),\n",
"    (NumericField, \"nbkills\", SORT_ONLY),\n",
"    (NumericField, \"isdead\", {}),\n",
"    (TagField, \"primaryweapon\", {}),\n",
"    (TagField, \"primaryweapontype\", {}),\n",
"    (TagField, \"primarysight\", {}),\n",
"    (TagField, \"primarygrip\", {}),\n",
"    (TagField, \"primaryunderbarrel\", {}),\n",
"    (TagField, \"primarybarrel\", {}),\n",
"    (TagField, \"secondaryweapon\", {}),\n",
"    (TagField, \"secondaryweapontype\", {}),\n",
"    (TagField, \"secondarysight\", {}),\n",
"    (TagField, \"secondarygrip\", {}),\n",
"    (TagField, \"secondaryunderbarrel\", {}),\n",
"    (TagField, \"secondarybarrel\", {}),\n",
"    (TagField, \"secondarygadget\", {}),\n",
")\n",
"schema = tuple(field_cls(name, **options) for field_cls, name, options in field_specs)\n",
"\n",
"# Index every hash whose key starts with `raw:` under the client's default index name.\n",
"raw_index_definition = IndexDefinition(prefix=(\"raw:\",), index_type=IndexType.HASH)\n",
"client.ft().create_index(schema, definition=raw_index_definition)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "015ac04d-5f92-4323-8814-40ec35384a3c",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}