From de79e13918bc5f611722a9dff6da7421051d5352 Mon Sep 17 00:00:00 2001
From: Erik Mackdanz
Date: Sat, 2 Dec 2023 22:02:55 -0600
Subject: [PATCH] some docstring

---
 README      |  3 ---
 src/main.rs | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+), 3 deletions(-)
 delete mode 100644 README

diff --git a/README b/README
deleted file mode 100644
index 4f12d96..0000000
--- a/README
+++ /dev/null
@@ -1,3 +0,0 @@
-. ~/var/source-for-polars-test.sh
-cargo build && time ./scripts/paraexport
-ls out/
diff --git a/src/main.rs b/src/main.rs
index 2d5aa1d..baeb091 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,3 +1,59 @@
+//! A sample map-reduce over a toy dataset
+//!
+//! # Quick start
+//!
+//! ```sh
+//! # Get AWS credentials that can read/write our S3 bucket
+//! . ~/var/source-for-polars-test.sh
+//!
+//! cargo build && time ./scripts/paraexport
+//! ls out/
+//! ```
+//!
+//! # How it works
+//!
+//! The polarsbop binary is spawned in parallel by a script such as
+//! scripts/paraexport.
+//!
+//! First, a map phase runs with exactly three workers, each taking
+//! one-third of the countries listed in the input data. This is
+//! implemented by map_bucket(). Each worker extracts its countries'
+//! "Primary Income" values and writes each one to one of nine
+//! buckets based on its value. This data, partitioned by worker and
+//! bucket, is written to the local filesystem and then copied into
+//! S3 as a shuffle for the next stage.
+//!
+//! The second stage is reduce_bucket(). It runs with nine workers,
+//! one per output bucket from the map stage. Each worker reads its
+//! three files from S3, one written by each map worker, into a
+//! single per-bucket dataframe, then outputs the median Primary
+//! Income value (locally, then to S3). The result is a
+//! per-income-bucket aggregation of Primary Income.
+//!
+//! The dataset is in data/. It's a CSV export of balance-of-payments
+//! data published by the International Monetary Fund. The polars
+//! library is used for processing the data, hence "polarsbop".
+//!
+//! # Why?
+//!
+//! Having recently learned how to use Spark for data ETL pipelining,
+//! I'm frustrated with the bloat of any JVM program, the cruft of a
+//! framework that evolved over many years, and the complexity of
+//! Spark's distributed backends.
+//!
+//! I thought I could accomplish something similar in a lean Rust
+//! program and achieve parallelism with basic Unix features.
+//! Dependencies are baked into the binary, so there is no need for
+//! Spark's mechanism for each worker to download jar libraries.
+//!
+//! # How could this be better?
+//!
+//! - It could be distributed across hosts by spawning commands over
+//!   ssh in scripts/paraexport.
+//!
+//! - The intermediate data is newline-delimited JSON. ORC might be
+//!   better?
+
 use aws_sdk_s3::primitives::ByteStream;
 use polars::prelude::*;
 use smartstring::SmartString;
-- 
2.52.0
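
For reference, a minimal, std-only Rust sketch of the partitioning the docstring describes. The three-way country split and the nine income-bucket cut points below are invented for illustration; the actual map_bucket() and reduce_bucket() in src/main.rs work on polars dataframes and shuffle the intermediate files through S3.

```rust
// Illustrative sketch only, not part of the patch: how three map
// workers might split the country list and how a "Primary Income"
// value might land in one of nine buckets. Thresholds are made up.

/// Countries handled by `worker_id` (in 0..workers); roughly
/// one-third of the input when workers == 3.
fn countries_for_worker<'a>(countries: &[&'a str], worker_id: usize, workers: usize) -> Vec<&'a str> {
    countries
        .iter()
        .enumerate()
        .filter(|(i, _)| i % workers == worker_id)
        .map(|(_, c)| *c)
        .collect()
}

/// Map a "Primary Income" value to a bucket index in 0..9.
fn income_bucket(primary_income: f64) -> usize {
    // Eight hypothetical cut points give nine buckets (0..=8).
    let cuts = [1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10];
    cuts.iter().filter(|&&cut| primary_income >= cut).count()
}

fn main() {
    let countries = ["Argentina", "Brazil", "Chile", "Denmark", "Estonia", "Fiji"];
    for worker in 0..3 {
        println!("worker {worker}: {:?}", countries_for_worker(&countries, worker, 3));
    }
    println!("250_000.0 falls in bucket {}", income_bucket(250_000.0));
}
```

Under these assumptions, the three workers cover every country exactly once and every value maps to exactly one of the nine buckets, which is the invariant the shuffle between the map and reduce stages relies on.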