From 6dc0486309c42a17589454462abf26bbc2f2a161 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radek=20Gol=C3=A1=C5=88=20jr?= Date: Tue, 7 Apr 2026 13:22:50 +0200 Subject: [PATCH] feat: initial dataloading finalized the data grabbing, sampling, and loading code, along with defining the schema for ft --- conf/redis.conf | 2 +- notebooks/Untitled.ipynb | 74 ------------ notebooks/fill_test.ipynb | 133 +++++++++++++++++++++ notebooks/r6data.ipynb | 240 ++++++++++++++++++++++++++++++++++++++ scripts/cluster-init.sh | 0 5 files changed, 374 insertions(+), 75 deletions(-) delete mode 100644 notebooks/Untitled.ipynb create mode 100644 notebooks/fill_test.ipynb create mode 100644 notebooks/r6data.ipynb mode change 100644 => 100755 scripts/cluster-init.sh diff --git a/conf/redis.conf b/conf/redis.conf index 5e45b34..4537a79 100644 --- a/conf/redis.conf +++ b/conf/redis.conf @@ -8,7 +8,7 @@ cluster-enabled yes cluster-config-file nodes.conf cluster-node-timeout 5000 -maxmemory 512mb +#maxmemory 512mb masterauth replication masteruser replication \ No newline at end of file diff --git a/notebooks/Untitled.ipynb b/notebooks/Untitled.ipynb deleted file mode 100644 index 029d133..0000000 --- a/notebooks/Untitled.ipynb +++ /dev/null @@ -1,74 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "05c0de49-0eb8-4cd2-89b2-c312111dda85", - "metadata": {}, - "outputs": [], - "source": [ - "%pip install redis hiredis fastid faker tqdm" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "90326242-f141-4035-9053-d3aab6cc9224", - "metadata": {}, - "outputs": [], - "source": [ - "from redis.cluster import RedisCluster\n", - "\n", - "client = RedisCluster(host=\"redis\", port=6379)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "50b9f473-8924-4d03-acf8-71ecf25e54a8", - "metadata": {}, - "outputs": [], - "source": [ - "from fastid import ulid\n", - "from faker import Faker\n", - "from tqdm.notebook import trange\n", - "faker
= Faker()\n", - "\n", - "with client.pipeline() as p:\n", - " for _ in trange(15_000_000):\n", - " p.hset(f\"profile:{ulid()}\", mapping={\"name\": \"John Doe\", \"sex\": \"M\"})\n", - " p.execute()\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "23933aa3-ff5d-45f9-b402-59e58c02a2b3", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.13.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/fill_test.ipynb b/notebooks/fill_test.ipynb new file mode 100644 index 0000000..6749d6d --- /dev/null +++ b/notebooks/fill_test.ipynb @@ -0,0 +1,133 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "05c0de49-0eb8-4cd2-89b2-c312111dda85", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: redis in /opt/conda/lib/python3.13/site-packages (7.4.0)\n", + "Requirement already satisfied: hiredis in /opt/conda/lib/python3.13/site-packages (3.3.1)\n", + "Requirement already satisfied: fastid in /opt/conda/lib/python3.13/site-packages (0.0.5)\n", + "Requirement already satisfied: tqdm in /opt/conda/lib/python3.13/site-packages (4.67.3)\n", + "Requirement already satisfied: kagglehub in /opt/conda/lib/python3.13/site-packages (1.0.0)\n", + "Requirement already satisfied: kagglesdk<1.0,>=0.1.14 in /opt/conda/lib/python3.13/site-packages (from kagglehub) (0.1.16)\n", + "Requirement already satisfied: packaging in /opt/conda/lib/python3.13/site-packages (from kagglehub) (26.0)\n", + "Requirement already satisfied: pyyaml in 
/opt/conda/lib/python3.13/site-packages (from kagglehub) (6.0.3)\n", + "Requirement already satisfied: requests in /opt/conda/lib/python3.13/site-packages (from kagglehub) (2.32.5)\n", + "Requirement already satisfied: protobuf in /opt/conda/lib/python3.13/site-packages (from kagglesdk<1.0,>=0.1.14->kagglehub) (6.33.5)\n", + "Requirement already satisfied: charset_normalizer<4,>=2 in /opt/conda/lib/python3.13/site-packages (from requests->kagglehub) (3.4.6)\n", + "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.13/site-packages (from requests->kagglehub) (3.11)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.13/site-packages (from requests->kagglehub) (2.6.3)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.13/site-packages (from requests->kagglehub) (2026.2.25)\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install redis hiredis fastid tqdm kagglehub" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "90326242-f141-4035-9053-d3aab6cc9224", + "metadata": {}, + "outputs": [], + "source": [ + "from redis.cluster import RedisCluster\n", + "from redis.commands.search.field import TextField, NumericField, TagField\n", + "from redis.commands.search.index_definition import IndexDefinition, IndexType\n", + "\n", + "\n", + "client = RedisCluster(host=\"redis\", port=6379, username=\"admin\", password=\"admin\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "50b9f473-8924-4d03-acf8-71ecf25e54a8", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "887890574a2048a587b6ed4dd8eef7c6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1000000 [00:00=0.1.14 in /opt/conda/lib/python3.13/site-packages (from kagglehub) (0.1.16)\n", + "Requirement already satisfied: packaging in 
/opt/conda/lib/python3.13/site-packages (from kagglehub) (26.0)\n", + "Requirement already satisfied: pyyaml in /opt/conda/lib/python3.13/site-packages (from kagglehub) (6.0.3)\n", + "Requirement already satisfied: requests in /opt/conda/lib/python3.13/site-packages (from kagglehub) (2.32.5)\n", + "Requirement already satisfied: protobuf in /opt/conda/lib/python3.13/site-packages (from kagglesdk<1.0,>=0.1.14->kagglehub) (6.33.5)\n", + "Requirement already satisfied: charset_normalizer<4,>=2 in /opt/conda/lib/python3.13/site-packages (from requests->kagglehub) (3.4.6)\n", + "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.13/site-packages (from requests->kagglehub) (3.11)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.13/site-packages (from requests->kagglehub) (2.6.3)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.13/site-packages (from requests->kagglehub) (2026.2.25)\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install redis hiredis fastid tqdm kagglehub" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "526cb932-59aa-489e-bbba-954ac645b633", + "metadata": {}, + "outputs": [], + "source": [ + "from redis.cluster import RedisCluster\n", + "from redis.cache import CacheConfig\n", + "from redis.retry import Retry\n", + "from redis.backoff import ExponentialBackoff\n", + "from pathlib import Path\n", + "import kagglehub\n", + "from csv import DictReader\n", + "from fastid import ulid\n", + "from tqdm.notebook import tqdm\n", + "\n", + "retry = Retry(ExponentialBackoff(), 8)\n", + "client = RedisCluster(\n", + " host=\"redis\",\n", + " port=6379,\n", + " username=\"admin\",\n", + " password=\"admin\",\n", + " retry=retry,\n", + " protocol=3,\n", + " cache_config=CacheConfig(),\n", + " decode_responses=True,\n", + " health_check_interval=3,\n", + " \n", + ")" + ] + }, + { + "cell_type": 
"code", + "execution_count": 4, + "id": "8d0875b5-f004-44fc-9519-89fc73c42b77", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Path to dataset files: /home/jovyan/.cache/kagglehub/datasets/awesomizer/rainbox-six-siege-dataset/versions/1\n" + ] + } + ], + "source": [ + "dataset = Path(kagglehub.dataset_download(\"awesomizer/rainbox-six-siege-dataset\"))\n", + "\n", + "print(\"Path to dataset files:\", dataset)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "be4c5461-1db2-4226-b71e-cdc7f06615bd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "full_dump = dataset/\"datadump_S5\"/\"datadump_S5.csv\"\n", + "objectives = dataset/\"datadump_S5_summary_objectives\"/\"datadump_S5_summary_objectives.csv\"\n", + "operator_loadouts = dataset/\"dataDump_s5_summary_operator_loadout\"/\"dataDump_s5_summary_operator_loadout.csv\"\n", + "display(full_dump.exists())\n", + "display(objectives.exists())\n", + "display(operator_loadouts.exists())" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "2efb5699-cced-4df0-9d76-278e23874436", + "metadata": {}, + "outputs": [], + "source": [ + "with full_dump.open(\"rb\") as f:\n", + " rows = sum(1 for line in f) - 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9177760b-eec4-4ab4-8242-0dd8df5db9dd", + "metadata": {}, + "outputs": [], + "source": [ + "daily_matches = {}\n", + "with full_dump.open(errors=\"ignore\") as csvfile:\n", + " reader = DictReader(csvfile, delimiter=\";\")\n", + " for i, row in enumerate(tqdm(reader, desc=\"Importing raw data into Redis...\", total=rows, unit=\"row\")):\n", + " if 
row[\"dateid\"] not in daily_matches: # There's a ton of data so we just select two matches per day\n", + " daily_matches[row[\"dateid\"]] = set((row[\"matchid\"],))\n", + " display(f\"Selected {row[\"matchid\"]} for day {row[\"dateid\"]}\")\n", + " if row[\"matchid\"] not in daily_matches[row[\"dateid\"]]: # Rows from a given day that aren't related to a specific match are ignored\n", + " if len(daily_matches[row[\"dateid\"]]) >= 2: # Skip only if we already have two matches picked\n", + " continue\n", + " daily_matches[row[\"dateid\"]].add(row[\"matchid\"])\n", + " display(f\"Selected {row[\"matchid\"]} for day {row[\"dateid\"]}\")\n", + " client.hset(f\"raw:{ulid()}\", mapping=row)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8b75e35-8bef-4b2b-ae25-1982076c73fd", + "metadata": {}, + "outputs": [], + "source": [ + "from redis.commands.search.field import TagField, NumericField, TextField\n", + "from redis.commands.search.index_definition import IndexDefinition, IndexType\n", + "\n", + "schema = (\n", + " NumericField(\"dateid\", sortable=True, no_index=True),\n", + " TagField(\"platform\"),\n", + " TagField(\"gamemode\"),\n", + " TagField(\"mapname\"),\n", + " NumericField(\"matchid\"),\n", + " NumericField(\"roundnumber\", sortable=True, no_index=True),\n", + " TagField(\"objectivelocation\"),\n", + " TagField(\"winrole\"),\n", + " TagField(\"endroundreason\"),\n", + " NumericField(\"roundduration\", sortable=True, no_index=True),\n", + " NumericField(\"clearancelevel\", sortable=True, no_index=True),\n", + " TagField(\"skillrank\"),\n", + " TagField(\"role\"),\n", + " NumericField(\"team\"),\n", + " NumericField(\"haswon\"),\n", + " TagField(\"operator\"),\n", + " NumericField(\"nbkills\", sortable=True, no_index=True),\n", + " NumericField(\"isdead\"),\n", + " TagField(\"primaryweapon\"),\n", + " TagField(\"primaryweapontype\"),\n", + " TagField(\"primarysight\"),\n", + " TagField(\"primarygrip\"),\n", + " 
TagField(\"primaryunderbarrel\"),\n", + " TagField(\"primarybarrel\"),\n", + " TagField(\"secondaryweapon\"),\n", + " TagField(\"secondaryweapontype\"),\n", + " TagField(\"secondarysight\"),\n", + " TagField(\"secondarygrip\"),\n", + " TagField(\"secondaryunderbarrel\"),\n", + " TagField(\"secondarybarrel\"),\n", + " TagField(\"secondarygadget\")\n", + ")\n", + "\n", + "client.ft().create_index(schema, definition=IndexDefinition(prefix=(\"raw:\",), index_type=IndexType.HASH))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "015ac04d-5f92-4323-8814-40ec35384a3c", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/scripts/cluster-init.sh b/scripts/cluster-init.sh old mode 100644 new mode 100755