From 020c1e77489b772f854bb3288b9c8d2818a6bf9d Mon Sep 17 00:00:00 2001 From: EuAndreh Date: Fri, 18 Apr 2025 02:17:12 -0300 Subject: git mv src/content/* src/content/en/ --- src/content/en/blog/2018/07/17/guix-nixos.adoc | 197 ++++++++++++ .../en/blog/2018/08/01/npm-ci-reproducibility.adoc | 147 +++++++++ src/content/en/blog/2018/12/21/ytdl-subs.adoc | 279 +++++++++++++++++ .../2019/06/02/nixos-stateless-workstation.adoc | 146 +++++++++ src/content/en/blog/2020/08/10/guix-srht.adoc | 128 ++++++++ .../en/blog/2020/08/31/database-i-wish-i-had.adoc | 299 ++++++++++++++++++ .../en/blog/2020/10/05/cargo2nix-demo.tar.gz | Bin 0 -> 59565 bytes src/content/en/blog/2020/10/05/cargo2nix.adoc | 72 +++++ src/content/en/blog/2020/10/05/cargo2nix.tar.gz | Bin 0 -> 53327 bytes .../en/blog/2020/10/05/swift2nix-demo.tar.gz | Bin 0 -> 61691 bytes src/content/en/blog/2020/10/05/swift2nix.adoc | 194 ++++++++++++ src/content/en/blog/2020/10/05/swift2nix.tar.gz | Bin 0 -> 57917 bytes src/content/en/blog/2020/10/19/feature-flags.adoc | 306 +++++++++++++++++++ .../en/blog/2020/10/20/wrong-interviewing.adoc | 340 +++++++++++++++++++++ src/content/en/blog/2020/11/07/diy-bugs.adoc | 93 ++++++ .../en/blog/2020/11/08/paradigm-shift-review.adoc | 154 ++++++++++ .../en/blog/2020/11/12/database-parsers-trees.adoc | 226 ++++++++++++++ .../en/blog/2020/11/14/local-first-review.adoc | 305 ++++++++++++++++++ .../en/blog/2021/01/26/remembering-ann.adoc | 216 +++++++++++++ src/content/en/blog/2021/02/17/fallible.adoc | 285 +++++++++++++++++ src/content/en/blog/2021/02/17/fallible.tar.gz | Bin 0 -> 1915439 bytes .../en/blog/2021/04/29/relational-review.adoc | 144 +++++++++ src/content/en/blog/categories.adoc | 1 + src/content/en/blog/index.adoc | 1 + 24 files changed, 3533 insertions(+) create mode 100644 src/content/en/blog/2018/07/17/guix-nixos.adoc create mode 100644 src/content/en/blog/2018/08/01/npm-ci-reproducibility.adoc create mode 100644 src/content/en/blog/2018/12/21/ytdl-subs.adoc create mode 100644 src/content/en/blog/2019/06/02/nixos-stateless-workstation.adoc create mode 100644 src/content/en/blog/2020/08/10/guix-srht.adoc create mode 100644 src/content/en/blog/2020/08/31/database-i-wish-i-had.adoc create mode 100644 src/content/en/blog/2020/10/05/cargo2nix-demo.tar.gz create mode 100644 src/content/en/blog/2020/10/05/cargo2nix.adoc create mode 100644 src/content/en/blog/2020/10/05/cargo2nix.tar.gz create mode 100644 src/content/en/blog/2020/10/05/swift2nix-demo.tar.gz create mode 100644 src/content/en/blog/2020/10/05/swift2nix.adoc create mode 100644 src/content/en/blog/2020/10/05/swift2nix.tar.gz create mode 100644 src/content/en/blog/2020/10/19/feature-flags.adoc create mode 100644 src/content/en/blog/2020/10/20/wrong-interviewing.adoc create mode 100644 src/content/en/blog/2020/11/07/diy-bugs.adoc create mode 100644 src/content/en/blog/2020/11/08/paradigm-shift-review.adoc create mode 100644 src/content/en/blog/2020/11/12/database-parsers-trees.adoc create mode 100644 src/content/en/blog/2020/11/14/local-first-review.adoc create mode 100644 src/content/en/blog/2021/01/26/remembering-ann.adoc create mode 100644 src/content/en/blog/2021/02/17/fallible.adoc create mode 100644 src/content/en/blog/2021/02/17/fallible.tar.gz create mode 100644 src/content/en/blog/2021/04/29/relational-review.adoc create mode 100644 src/content/en/blog/categories.adoc create mode 100644 src/content/en/blog/index.adoc (limited to 'src/content/en/blog') diff --git a/src/content/en/blog/2018/07/17/guix-nixos.adoc 
b/src/content/en/blog/2018/07/17/guix-nixos.adoc new file mode 100644 index 0000000..42290f6 --- /dev/null +++ b/src/content/en/blog/2018/07/17/guix-nixos.adoc @@ -0,0 +1,197 @@ += Running Guix on NixOS + +:install-step: https://www.gnu.org/software/guix/manual/en/html_node/Binary-Installation.html#Binary-Installation + +I wanted to run Guix on a NixOS machine. Even though the Guix manual explains +how to do it {install-step}[step by step], I needed a few extra ones to make it +work properly. + +I couldn't just install GuixSD because my wireless network card doesn't have any +free drivers (yet). + +== Creating `guixbuilder` users + +:manual: https://www.gnu.org/software/guix/manual/en/html_node/Build-Environment-Setup.html#Build-Environment-Setup + +Guix requires you to create non-root users that will be used to perform the +builds in the isolated environments. + +The {manual}[manual] already provides you with a ready to run (as root) command +for creating the build users: + +[source,sh] +---- +groupadd --system guixbuild +for i in `seq -w 1 10`; +do + useradd -g guixbuild -G guixbuild \ + -d /var/empty -s `which nologin` \ + -c "Guix build user $i" --system \ + guixbuilder$i; +done +---- + +:mutable-users: https://nixos.org/nixos/manual/index.html#sec-user-management + +However, In my personal NixOS I have disabled +{mutable-users}[`users.mutableUsers`], which means that even if I run the above +command it means that they'll be removed once I rebuild my OS: + +[source,sh] +---- +$ sudo nixos-rebuild switch +(...) +removing user ‘guixbuilder7’ +removing user ‘guixbuilder3’ +removing user ‘guixbuilder10’ +removing user ‘guixbuilder1’ +removing user ‘guixbuilder6’ +removing user ‘guixbuilder9’ +removing user ‘guixbuilder4’ +removing user ‘guixbuilder2’ +removing user ‘guixbuilder8’ +removing user ‘guixbuilder5’ +(...) +---- + +Instead of enabling `users.mutableUsers` I could add the Guix users by adding +them to my system configuration: + +[source,nix] +---- +{ config, pkgs, ...}: + +{ + + # ... NixOS usual config ellided ... + + users = { + mutableUsers = false; + + extraUsers = + let + andrehUser = { + andreh = { + # my custom user config + }; + }; + buildUser = (i: + { + "guixbuilder${i}" = { # guixbuilder$i + group = "guixbuild"; # -g guixbuild + extraGroups = ["guixbuild"]; # -G guixbuild + home = "/var/empty"; # -d /var/empty + shell = pkgs.nologin; # -s `which nologin` + description = "Guix build user ${i}"; # -c "Guix buid user $i" + isSystemUser = true; # --system + }; + } + ); + in + # merge all users + pkgs.lib.fold (str: acc: acc // buildUser str) + andrehUser + # for i in `seq -w 1 10` + (map (pkgs.lib.fixedWidthNumber 2) (builtins.genList (n: n+1) 10)); + + extraGroups.guixbuild = { + name = "guixbuild"; + }; + }; +} +---- + +Here I used `fold` and the `//` operator to merge all of the configuration sets +into a single `extraUsers` value. + +== Creating the `systemd` service + +:service-file: https://git.savannah.gnu.org/cgit/guix.git/tree/etc/guix-daemon.service.in?id=00c86a888488b16ce30634d3a3a9d871ed6734a2 + +One other thing missing was the `systemd` service. + +First I couldn't just copy the `.service` file to `/etc` since in NixOS that +folder isn't writable. But also I wanted the service to be better integrated +with the OS. 
+ +That was a little easier than creating the users, all I had to do was translate +the provided {service-file}[`guix-daemon.service.in`] configuration to an +equivalent Nix expression: + +[source,ini] +---- +# This is a "service unit file" for the systemd init system to launch +# 'guix-daemon'. Drop it in /etc/systemd/system or similar to have +# 'guix-daemon' automatically started. + +[Unit] +Description=Build daemon for GNU Guix + +[Service] +ExecStart=/var/guix/profiles/per-user/root/guix-profile/bin/guix-daemon --build-users-group=guixbuild +Environment=GUIX_LOCPATH=/root/.guix-profile/lib/locale +RemainAfterExit=yes +StandardOutput=syslog +StandardError=syslog + +# See . +# Some package builds (for example, go@1.8.1) may require even more than +# 1024 tasks. +TasksMax=8192 + +[Install] +WantedBy=multi-user.target +---- + +This sample `systemd` configuration file became: + +[source,nix] +---- +guix-daemon = { + enable = true; + description = "Build daemon for GNU Guix"; + serviceConfig = { + ExecStart = "/var/guix/profiles/per-user/root/guix-profile/bin/guix-daemon --build-users-group=guixbuild"; + Environment="GUIX_LOCPATH=/root/.guix-profile/lib/locale"; + RemainAfterExit="yes"; + StandardOutput="syslog"; + StandardError="syslog"; + TaskMax= "8192"; + }; + wantedBy = [ "multi-user.target" ]; +}; +---- + +There you go! After running `sudo nixos-rebuild switch` I could get Guix up and +running: + +[source,sh] +---- +$ guix package -i hello +The following package will be installed: + hello 2.10 /gnu/store/bihfrh609gkxb9dp7n96wlpigiv3krfy-hello-2.10 + +substitute: updating substitutes from 'https://mirror.hydra.gnu.org'... 100.0% +The following derivations will be built: + /gnu/store/nznmdn6inpwxnlkrasydmda4s2vsp9hg-profile.drv + /gnu/store/vibqrvw4c8lacxjrkqyzqsdrmckv77kq-fonts-dir.drv + /gnu/store/hi8alg7wi0wgfdi3rn8cpp37zhx8ykf3-info-dir.drv + /gnu/store/cvkbp378cvfjikz7mjymhrimv7j12p0i-ca-certificate-bundle.drv + /gnu/store/d62fvxymnp95rzahhmhf456bsf0xg1c6-manual-database.drv +Creating manual page database... +1 entries processed in 0.0 s +2 packages in profile +$ hello +Hello, world! +---- + +:nixos-modules: https://nixos.org/nixos/manual/index.html#sec-writing-modules +:req: https://www.gnu.org/software/guix/manual/en/html_node/Requirements.html#Requirements + +Some improvements to this approach are: + +. looking into {nixos-modules}[NixOS modules] and trying to bundle everything + together into a single logical unit; +. {req}[build Guix from source] and share the Nix store and daemon with Guix. + +Happy Guix/Nix hacking! 
diff --git a/src/content/en/blog/2018/08/01/npm-ci-reproducibility.adoc b/src/content/en/blog/2018/08/01/npm-ci-reproducibility.adoc new file mode 100644 index 0000000..76bd8e6 --- /dev/null +++ b/src/content/en/blog/2018/08/01/npm-ci-reproducibility.adoc @@ -0,0 +1,147 @@ += Verifying "npm ci" reproducibility +:updatedat: 2019-05-22 + +:empty: +:npm-5: https://blog.npmjs.org/post/161081169345/v500 +:package-locks-old: https://docs.npmjs.com/files/package-locks +:package-lock: https://docs.npmjs.com/files/package-lock.json +:add-npm-ci: https://blog.npmjs.org/post/171556855892/introducing-npm-ci-for-faster-more-reliable +:cli-docs: https://docs.npmjs.com/cli/install#description +:tricky-issue: https://github.com/npm/npm/issues/17979#issuecomment-332701215 + +When {npm-5}[npm@5] came bringing {package-locks-old}[package-locks] with it, I +was confused about the benefits it provided, since running `npm install` more +than once could resolve all the dependencies again and yield yet another fresh +`package-lock.json` file. The message saying "you should add this file to +version control" left me hesitant on what to +do{empty}footnote:package-lock-message[ + {cli-docs}[documentation] claims `npm install` is driven by the existing + `package-lock.json`, but that's actually {tricky-issue}[a little bit tricky]. +]. + +However the {add-npm-ci}[addition of `npm ci`] filled this gap: it's a stricter +variation of `npm install` which guarantees that "{package-lock}[subsequent +installs are able to generate identical trees]". But are they really identical? +I could see that I didn't have the same problems of different installation +outputs, but I didn't know for *sure* if it was really identical. + +== Computing the hash of a directory's content + +:merkle-tree: https://en.wikipedia.org/wiki/Merkle_tree + +I quickly searched for a way to check for the hash signature of an entire +directory tree, but I couldn't find one. I've made a poor man's +{merkle-tree}[Merkle tree] implementation using `sha256sum` and a few piped +commands at the terminal: + +[source,sh] +---- +merkle-tree () { + dirname="${1-.}" + pushd "$dirname" + find . -type f | + sort | + xargs -I{} sha256sum "{}" | + sha256sum | + awk '{print $1}' + popd +} +---- + +Going through it line by line: + +* #1 we define a Bash function called `merkle-tree`; +* #2 it accepts a single argument: the directory to compute the merkle tree from + If nothing is given, it runs on the current directory (`.`); +* #3 we go to the directory, so we don't get different prefixes in `find`'s + output (like `../a/b`); +* #4 we get all files from the directory tree. Since we're using `sha256sum` to + compute the hash of the file contents, we need to filter out folders from it; +* #5 we need to sort the output, since different file systems and `find` + implementations may return files in different orders; +* #6 we use `xargs` to compute the hash of each file individually through + `sha256sum`. Since a file may contain spaces we need to escape it with + quotes; +* #7 we compute the hash of the combined hashes. Since `sha256sum` output is + formatted like ` `, it produces a different final hash if a + file ever changes name without changing it's content; +* #8 we get the final hash output, excluding the `` (which is `-` in + this case, aka `stdin`). + +=== Positive points: + +. ignore timestamp: running more than once on different installation yields the + same hash; +. the name of the file is included in the final hash computation. + +=== Limitations: + +. 
it ignores empty folders from the hash computation; +. the implementation's only goal is to represent using a digest whether the + content of a given directory is the same or not. Leaf presence checking is + obviously missing from it. + +=== Testing locally with sample data + +[source,sh] +---- +mkdir /tmp/merkle-tree-test/ +cd /tmp/merkle-tree-test/ +mkdir -p a/b/ a/c/ d/ +echo "one" > a/b/one.txt +echo "two" > a/c/two.txt +echo "three" > d/three.txt +merkle-tree . # output is be343bb01fe00aeb8fef14a3e16b1c3d1dccbf86d7e41b4753e6ccb7dc3a57c3 +merkle-tree . # output still is be343bb01fe00aeb8fef14a3e16b1c3d1dccbf86d7e41b4753e6ccb7dc3a57c3 +echo "four" > d/four.txt +merkle-tree . # output is now b5464b958969ed81815641ace96b33f7fd52c20db71a7fccc45a36b3a2ae4d4c +rm d/four.txt +merkle-tree . # output back to be343bb01fe00aeb8fef14a3e16b1c3d1dccbf86d7e41b4753e6ccb7dc3a57c3 +echo "hidden-five" > a/b/one.txt +merkle-tree . # output changed 471fae0d074947e4955e9ac53e95b56e4bc08d263d89d82003fb58a0ffba66f5 +---- + +It seems to work for this simple test case. + +You can try copying and pasting it to verify the hash signatures. + +== Using `merkle-tree` to check the output of `npm ci` + +_I've done all of the following using Node.js v8.11.3 and npm@6.1.0_. + +In this test case I'll take the main repo of +https://lernajs.io/[Lerna]footnote:lerna-package-lock[ + Finding a big known repo that actually committed the `package-lock.json` file + was harder than I expected. +]: + +```bash +cd /tmp/ +git clone https://github.com/lerna/lerna.git +cd lerna/ +git checkout 57ff865c0839df75dbe1974971d7310f235e1109 +npm ci +merkle-tree node_modules/ # outputs 11e218c4ac32fac8a9607a8da644fe870a25c99821167d21b607af45699afafa +rm -rf node_modules/ +npm ci +merkle-tree node_modules/ # outputs 11e218c4ac32fac8a9607a8da644fe870a25c99821167d21b607af45699afafa +npm ci # test if it also works with an existing node_modules/ folder +merkle-tree node_modules/ # outputs 11e218c4ac32fac8a9607a8da644fe870a25c99821167d21b607af45699afafa +``` + +Good job `npm ci` :) + +#6 and #9 take some time to run (21 seconds in my machine), but this specific +use case isn't performance sensitive. The slowest step is computing the hash of +each individual file. + +== Conclusion + +`npm ci` really "generates identical trees". + +I'm not aware of any other existing solution for verifying the hash signature of +a directory. If you know any, shoot me an email, as I'd like to know it. + +== *Edit* + +2019-05-22: Fix spelling. diff --git a/src/content/en/blog/2018/12/21/ytdl-subs.adoc b/src/content/en/blog/2018/12/21/ytdl-subs.adoc new file mode 100644 index 0000000..10afbf6 --- /dev/null +++ b/src/content/en/blog/2018/12/21/ytdl-subs.adoc @@ -0,0 +1,279 @@ += Using "youtube-dl" to manage YouTube subscriptions + +:ytsm-ann: https://old.reddit.com/r/DataHoarder/comments/9sg8q5/i_built_a_selfhosted_youtube_subscription_manager/ +:ytsm-code: https://github.com/chibicitiberiu/ytsm +:ytdl: https://youtube-dl.org/ + +I've recently read the {ytsm-ann}[announcement] of a very nice +{ytsm-code}[self-hosted YouTube subscription manager]. I haven't used YouTube's +built-in subscriptions for a while now, and haven't missed it at all. When I +saw the announcement, I considered writing about the solution I've built on top +of {ytdl}[youtube-dl]. 
+ +== Background: the problem with YouTube + +:net-giants: https://staltz.com/what-happens-when-you-block-internet-giants.html + +In many ways, I agree with {net-giants}[André Staltz's view on data ownership +and privacy]: + +____ +I started with the basic premise that "I want to be in control of my data". +Sometimes that meant choosing when to interact with an internet giant and how +much I feel like revealing to them. Most of times it meant not interacting with +them at all. I don't want to let them be in full control of how much they can +know about me. I don't want to be in autopilot mode. (...) Which leads us to +YouTube. While I was able to find alternatives to Gmail (Fastmail), Calendar +(Fastmail), Translate (Yandex Translate), _etc._ YouTube remains as the most +indispensable Google-owned web service. It is really really hard to avoid +consuming YouTube content. It was probably the smartest startup acquisition +ever. My privacy-oriented alternative is to watch YouTube videos through Tor, +which is technically feasible but not polite to use the Tor bandwidth for these +purposes. I'm still scratching my head with this issue. +____ + +Even though I don't use most alternative services he mentions, I do watch videos +from YouTube. But I also feel uncomfortable logging in to YouTube with a Google +account, watching videos, creating playlists and similar things. + +Using the mobile app is worse: you can't even block ads in there. You're in +less control on what you share with YouTube and Google. + +== youtube-dl + +:other-sites: https://rg3.github.io/youtube-dl/supportedsites.html + +youtube-dl is a command-line tool for downloading videos, from YouTube and +{other-sites}[many other sites]: + +[source,sh] +---- +$ youtube-dl https://www.youtube.com/watch?v=rnMYZnY3uLA +[youtube] rnMYZnY3uLA: Downloading webpage +[youtube] rnMYZnY3uLA: Downloading video info webpage +[download] Destination: A Origem da Vida _ Nerdologia-rnMYZnY3uLA.mp4 +[download] 100% of 32.11MiB in 00:12 +---- + +It can be used to download individual videos as showed above, but it also has +some interesting flags that we can use: + +* `--output`: use a custom template to create the name of the downloaded file; +* `--download-archive`: use a text file for recording and remembering which + videos were already downloaded; +* `--prefer-free-formats`: prefer free video formats, like `webm`, `ogv` and + Matroska `mkv`; +* `--playlist-end`: how many videos to download from a "playlist" (a channel, a + user or an actual playlist); +* `--write-description`: write the video description to a `.description` file, + useful for accessing links and extra content. + +Putting it all together: + +[source,sh] +---- +$ youtube-dl "https://www.youtube.com/channel/UClu474HMt895mVxZdlIHXEA" \ + --download-archive ~/Nextcloud/cache/youtube-dl-seen.conf \ + --prefer-free-formats \ + --playlist-end 20 \ + --write-description \ + --output "~/Downloads/yt-dl/%(uploader)s/%(upload_date)s - %(title)s.%(ext)s" +---- + +This will download the latest 20 videos from the selected channel, and write +down the video IDs in the `youtube-dl-seen.conf` file. Running it immediately +after one more time won't have any effect. + +If the channel posts one more video, running the same command again will +download only the last video, since the other 19 were already downloaded. 
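
The `youtube-dl-seen.conf` archive itself is just plain text, with roughly one
`extractor video-id` pair per downloaded video, which is what makes it easy to
sync and keep under version control later on (illustrative content, reusing the
video ID from the earlier example):

----
youtube rnMYZnY3uLA
----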
+ +With this basic setup you have a minimal subscription system at work, and you +can create some functions to help you manage that: + +[source,sh] +---- +#!/bin/sh + +export DEFAULT_PLAYLIST_END=15 + +download() { + youtube-dl "$1" \ + --download-archive ~/Nextcloud/cache/youtube-dl-seen.conf \ + --prefer-free-formats \ + --playlist-end "$2" \ + --write-description \ + --output "~/Downloads/yt-dl/%(uploader)s/%(upload_date)s - %(title)s.%(ext)s" +} +export -f download + + +download_user() { + download "https://www.youtube.com/user/$1" "${2-$DEFAULT_PLAYLIST_END}" +} +export -f download_user + + +download_channel() { + download "https://www.youtube.com/channel/$1" "${2-$DEFAULT_PLAYLIST_END}" +} +export -f download_channel + + +download_playlist() { + download "https://www.youtube.com/playlist?list=$1" "${2-$DEFAULT_PLAYLIST_END}" +} +export -f download_playlist +---- + +With these functions, you now can have a subscription fetching script to +download the latest videos from your favorite channels: + +[source,sh] +---- +#!/bin/sh + +download_user ClojureTV 15 +download_channel 'UCmEClzCBDx-vrt0GuSKBd9g' 100 +download_playlist 'PLqG7fA3EaMRPzL5jzd83tWcjCUH9ZUsbX' 15 +---- + +Now, whenever you want to watch the latest videos, just run the above script +and you'll get all of them in your local machine. + +== Tradeoffs + +=== I've made it for myself, with my use case in mind + + +[qanda] +Offline:: +My internet speed it somewhat +reasonable{empty}footnote:internet-speed[ + Considering how expensive it is and the many ways it could be better, but also + how much it has improved over the last years, I say it's reasonable. +], but it is really unstable. Either at work or at home, it's not uncommon to +loose internet access for 2 minutes 3~5 times every day, and stay completely +offline for a couple of hours once every week. ++ +Working through the hassle of keeping a playlist on disk has payed off many, +many times. Sometimes I even not notice when the connection drops for some +minutes, because I'm watching a video and working on some document, all on my +local computer. ++ +There's also no quality adjustment for YouTube's web player, I always pick the +higher quality and it doesn't change during the video. For some types of +content, like a podcast with some tiny visual resources, this doesn't change +much. For other types of content, like a keynote presentation with text written +on the slides, watching on 144p isn't really an option. ++ +If the internet connection drops during the video download, youtube-dl will +resume from where it stopped. ++ +This is an offline first benefit that I really like, and works well for me. + + +Sync the "seen" file:: +I already have a running instance of Nextcloud, so just dumping the +`youtube-dl-seen.conf` file inside Nextcloud was a no-brainer. ++ +You could try putting it in a dedicated git repository, and wrap the script with +an autocommit after every run. If you ever had a merge conflict, you'd simply +accept all changes and then run the following to tidy up the file: ++ +[source,sh] +---- +$ uniq youtube-dl-seen.conf > youtube-dl-seen.conf +---- + + +Doesn't work on mobile:: +My primary device that I use everyday is my laptop, not my phone. It works well +for me this way. ++ +Also, it's harder to add ad-blockers to mobile phones, and most mobile software +still depends on Google's and Apple's blessing. ++ +If you wish, you can sync the videos to the SD card periodically, but that's a +bit of extra manual work. 
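
If you do go the git route for the "seen" file mentioned above, a minimal
autocommit wrapper could look something like the sketch below. The repository
path and the `subscriptions.sh` name are placeholders, and the dedup step goes
through a temporary file so the archive isn't truncated while it's being read:

[source,sh]
----
#!/bin/sh
set -eu

# Hypothetical location of the git repository holding the "seen" file.
SEEN_REPO="$HOME/yt-dl-seen"
cd "$SEEN_REPO"

# Tidy up duplicates (e.g. after resolving a merge conflict) without
# clobbering the file mid-read.
sort -u youtube-dl-seen.conf > youtube-dl-seen.conf.tmp
mv youtube-dl-seen.conf.tmp youtube-dl-seen.conf

# Fetch the latest videos, then record the updated archive.
./subscriptions.sh
git add youtube-dl-seen.conf
git commit -m "Update seen file: $(date +%Y-%m-%d)" || true  # no-op when nothing changed
----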
+ + +=== The Good + + +[qanda] +Better privacy:: +We don't even have to configure the ad-blocker to keep ads and trackers away! ++ +YouTube still has your IP address, so using a VPN is always a good idea. +However, a timing analysis would be able to identify you (considering the +current implementation). + + +No need to self-host:: +There's no host that needs maintenance. Everything runs locally. ++ +As long as you keep youtube-dl itself up to date and sync your "seen" file, +there's little extra work to do. + + +Track your subscriptions with git:: +After creating a `subscriptions.sh` executable that downloads all the videos, +you can add it to git and use it to track metadata about your subscriptions. + + +=== The Bad + + +[qanda] +Maximum playlist size is your disk size:: +This is a good thing for getting a realistic view on your actual "watch later" +list. However I've run out of disk space many times, and now I need to be more +aware of how much is left. + + +=== The Ugly + +We can only avoid all the bad parts of YouTube with youtube-dl as long as +YouTube keeps the videos public and programmatically accessible. If YouTube +ever blocks that we'd loose the ability to consume content this way, but also +loose confidence on considering YouTube a healthy repository of videos on the +internet. + + +== Going beyond + +Since you're running everything locally, here are some possibilities to be +explored: + + +=== A playlist that is too long for being downloaded all at once + +You can wrap the `download_playlist` function (let's call the wrapper +`inc_download`) and instead of passing it a fixed number to the `--playlist-end` +parameter, you can store the `$n` in a folder (something like +`$HOME/.yt-db/$PLAYLIST_ID`) and increment it by `$step` every time you run +`inc_download`. + +This way you can incrementally download videos from a huge playlist without +filling your disk with gigabytes of content all at once. + + +=== Multiple computer scenario + +The `download_playlist` function could be aware of the specific machine that it +is running on and apply specific policies depending on the machine: always +download everything; only download videos that aren't present anywhere else; +_etc._ + + +== Conclusion + +youtube-dl is a great tool to keep at hand. It covers a really large range of +video websites and works robustly. + +Feel free to copy and modify this code, and send me suggestions of improvements +or related content. + +== _Edit_ + +2019-05-22: Fix spelling. diff --git a/src/content/en/blog/2019/06/02/nixos-stateless-workstation.adoc b/src/content/en/blog/2019/06/02/nixos-stateless-workstation.adoc new file mode 100644 index 0000000..f89a106 --- /dev/null +++ b/src/content/en/blog/2019/06/02/nixos-stateless-workstation.adoc @@ -0,0 +1,146 @@ += Using NixOS as an stateless workstation + +:empty: +:nixos: https://nixos.org/ + +Last +week{empty}footnote:last-week[ + "Last week" as of the start of this writing, so around the end of May 2019. +] I changed back to an +old{empty}footnote:old-computer[ + I was using a 32GB RAM, i7 and 250GB SSD Samsung laptop. The switch was back + to a 8GB RAM, i5 and 500GB HDD Dell laptop. The biggest difference I noticed + was on faster memory, both RAM availability and the disk speed, but I had + 250GB less local storage space. +] Samsung laptop, and installed {nixos}[NixOS] on it. 
+ +After using NixOS on another laptop for around two years, I wanted verify how +reproducible was my desktop environment, and how far does NixOS actually can go +on recreating my whole OS from my configuration files and personal data. I +gravitated towards NixOS after trying (and failing) to create an `install.sh` +script that would imperatively install and configure my whole OS using apt-get. +When I found a GNU/Linux distribution that was built on top of the idea of +declaratively specifying the whole OS I was automatically +convinced{empty}footnote:convincend-by-declarative-aspect[ + The declarative configuration aspect is something that I now completely take + for granted, and wouldn't consider using something which isn't declarative. A + good metric to show this is me realising that I can't pinpoint the moment when + I decided to switch to NixOS. It's like I had a distant past when this wasn't + true. +]. + +I was impressed. Even though I've been experiencing the benefits of Nix +isolation daily, I always felt skeptical that something would be missing, +because the devil is always on the details. But the result was much better than +expected! + +There were only 2 missing configurations: + +. tap-to-click on the touchpad wasn't enabled by default; +. the default theme from the gnome-terminal is "Black on white" instead of + "White on black". + +That's all. + +I haven't checked if I can configure those in NixOS GNOME module, but I guess +both are scriptable and could be set in a fictional `setup.sh` run. + +This makes me really happy, actually. More happy than I anticipated. + +Having such a powerful declarative OS makes me feel like my data is the really +important stuff (as it should be), and I can interact with it on any +workstation. All I need is an internet connection and a few hours to download +everything. It feels like my physical workstation and the installed OS are +serving me and my data, instead of me feeling as hostage to the specific OS +configuration at the moment. Having a few backup copies of everything important +extends such peacefulness. + +After this positive experience with recreating my OS from simple Nix +expressions, I started to wonder how far I could go with this, and started +considering other areas of improvements: + +== First run on a fresh NixOS installation + +Right now the initial setup relies on non-declarative manual tasks, like +decrypting some credentials, or manually downloading *this* git repository with +specific configurations before *that* one. + +I wonder what some areas of improvements are on this topic, and if investing on +it is worth it (both time-wise and happiness-wise). + +== Emacs + +:spacemacs: https://spacemacs.org/ +:emacs: https://www.gnu.org/software/emacs/ +:layers: https://spacemacs.org/doc/LAYERS.html +:there: https://nixos.org/nixos/manual/index.html#module-services-emacs-adding-packages +:packages: https://www.gnu.org/software/guix/manual/en/html_node/Application-Setup.html#Emacs-Packages + +Right now I'm using the {spacemacs}[Spacemacs], which is a community package +curation and configuration on top of {emacs}[Emacs]. + +Spacemacs does support the notion of {layers}[layers], which you can +declaratively specify and let Spacemacs do the rest. + +However this solution isn't nearly as robust as Nix: being purely functional, +Nix does describe everything required to build a derivation, and knows how to do +so. 
Spacemacs it closer to more traditional package managers: even though the +layers list is declarative, the installation is still very much imperative. +I've had trouble with Spacemacs not behaving the same on different computers, +both with identical configurations, only brought to convergence back again after +a `git clean -fdx` inside `~/.emacs.d/`. + +The ideal solution would be managing Emacs packages with Nix itself. After a +quick search I did found that {there}[there is support for Emacs packages in +Nix]. So far I was only aware of {packages}[Guix support for Emacs packages]. + +This isn't a trivial change because Spacemacs does include extra curation and +configuration on top of Emacs packages. I'm not sure the best way to improve +this right now. + +== myrepos + +:myrepos: https://myrepos.branchable.com/ + +I'm using {myrepos}[myrepos] to manage all my git repositories, and the general +rule I apply is to add any repository specific configuration in myrepos' +`checkout` phase: + +[source,sh] +---- +# sample ~/.mrconfig file snippet +[dev/guix/guix] +checkout = + git clone https://git.savannah.gnu.org/git/guix.git guix + cd guix/ + git config sendemail.to guix-patches@gnu.org +---- + +This way when I clone this repo again the email sending is already +pre-configured. + +This works well enough, but the solution is too imperative, and my `checkout` +phases tend to become brittle over time if not enough care is taken. + +== GNU Stow + +:not-at-all: https://euandre.org/git/dotfiles/tree/bash/symlinks.sh?id=316939aa215181b1d22b69e94241eef757add98d +:stow: https://www.gnu.org/software/stow/ + +For my home profile and personal configuration I already have a few dozens of +symlinks that I manage manually. This has worked so far, but the solution is +sometimes fragile and {not-at-all}[not declarative at all]. I wonder if +something like {stow}[GNU Stow] can help me simplify this. + +== Conclusion + +:nix: https://nixos.org/nix/ + +I'm really satisfied with NixOS, and I intend to keep using it. If what I've +said interests you, maybe try tinkering with the {nix}[Nix package manager] (not +the whole NixOS) on your current distribution (it can live alongside any other +package manager). + +If you have experience with declarative Emacs package managements, GNU Stow or +any similar tool, _etc._, mail me some tips]. If you don't have any experience +at all, I'd still love to hear from you. diff --git a/src/content/en/blog/2020/08/10/guix-srht.adoc b/src/content/en/blog/2020/08/10/guix-srht.adoc new file mode 100644 index 0000000..a89e86e --- /dev/null +++ b/src/content/en/blog/2020/08/10/guix-srht.adoc @@ -0,0 +1,128 @@ += Guix inside sourcehut builds.sr.ht CI +:updatedat: 2020-08-19 + +:nixos: https://man.sr.ht/builds.sr.ht/compatibility.md#nixos +:guix: https://guix.gnu.org/ +:binary-inst: https://guix.gnu.org/manual/en/guix.html#Binary-Installation +:shell-inst: https://git.savannah.gnu.org/cgit/guix.git/plain/etc/guix-install.sh + +After the release of the {nixos}[NixOS images in builds.sr.ht] and much usage of +it, I also started looking at {guix}[Guix] and wondered if I could get it on the +awesome builds.sr.ht service. + +The Guix manual section on the {binary-inst}[binary installation] is very +thorough, and even a {shell-inst}[shell installer script] is provided, but it is +built towards someone installing Guix on their personal computer, and relies +heavily on interactive input. 
+ +I developed the following set of scripts that I have been using for some time to +run Guix tasks inside builds.sr.ht jobs. First, `install-guix.sh`: + +[source,sh] +---- +#!/usr/bin/env bash +set -x +set -Eeuo pipefail + +VERSION='1.0.1' +SYSTEM='x86_64-linux' +BINARY="guix-binary-${VERSION}.${SYSTEM}.tar.xz" + +cd /tmp +wget "https://ftp.gnu.org/gnu/guix/${BINARY}" +tar -xf "${BINARY}" + +sudo mv var/guix /var/ +sudo mv gnu / +sudo mkdir -p ~root/.config/guix +sudo ln -fs /var/guix/profiles/per-user/root/current-guix ~root/.config/guix/current + +GUIX_PROFILE="$(echo ~root)/.config/guix/current" +source "${GUIX_PROFILE}/etc/profile" + +groupadd --system guixbuild +for i in $(seq -w 1 10); +do + useradd -g guixbuild \ + -G guixbuild \ + -d /var/empty \ + -s "$(command -v nologin)" \ + -c "Guix build user ${i}" --system \ + "guixbuilder${i}"; +done + +mkdir -p /usr/local/bin +cd /usr/local/bin +ln -s /var/guix/profiles/per-user/root/current-guix/bin/guix . +ln -s /var/guix/profiles/per-user/root/current-guix/bin/guix-daemon . + +guix archive --authorize < ~root/.config/guix/current/share/guix/ci.guix.gnu.org.pub +---- + +Almost all of it is taken directly from the {binary-inst}[binary installation] +section from the manual, with the interactive bits stripped out: after +downloading and extracting the Guix tarball, we create some symlinks, add +guixbuild users and authorize the `ci.guix.gnu.org.pub` signing key. + +After installing Guix, we perform a `guix pull` to update Guix inside +`start-guix.sh`: + +[source,sh] +---- +#!/usr/bin/env bash +set -x +set -Eeuo pipefail + +sudo guix-daemon --build-users-group=guixbuild & +guix pull +guix package -u +guix --version +---- + +Then we can put it all together in a sample `.build.yml` configuration file I'm +using myself: + +[source,yaml] +---- +image: debian/stable +packages: + - wget +sources: + - https://git.sr.ht/~euandreh/songbooks +tasks: + - install-guix: | + cd ./songbooks/ + ./scripts/install-guix.sh + ./scripts/start-guix.sh + echo 'sudo guix-daemon --build-users-group=guixbuild &' >> ~/.buildenv + echo 'export PATH="${HOME}/.config/guix/current/bin${PATH:+:}$PATH"' >> ~/.buildenv + - tests: | + cd ./songbooks/ + guix environment -m build-aux/guix.scm -- make check + - docs: | + cd ./songbooks/ + guix environment -m build-aux/guix.scm -- make publish-dist +---- + +We have to add the `guix-daemon` to `~/.buildenv` so it can be started on every +following task run. Also, since we used `wget` inside `install-guix.sh`, we had +to add it to the images package list. + +After the `install-guix` task, you can use Guix to build and test your project, +or run any `guix environment --ad-hoc my-package -- my script` :) + +== Improvements + +:repository: https://git.sr.ht/~sircmpwn/builds.sr.ht + +When I originally created this code I had a reason why to have both a `sudo` +call for `sudo ./scripts/install-guix.sh` and `sudo` usages inside +`install-guix.sh` itself. I couldn't figure out why (it feels like my past self +was a bit smarter 😬), but it feels ugly now. If it is truly required I could +add an explanation for it, or remove this entirely in favor of a more elegant +solution. + +I could also contribute the Guix image upstream to builds.sr.ht, but there +wasn't any build or smoke tests in the original {repository}[repository], so I +wasn't inclined to make something that just ``works on my machine'' or add a +maintainence burden to the author. I didn't look at it again recently, though. 
diff --git a/src/content/en/blog/2020/08/31/database-i-wish-i-had.adoc b/src/content/en/blog/2020/08/31/database-i-wish-i-had.adoc new file mode 100644 index 0000000..7f010b9 --- /dev/null +++ b/src/content/en/blog/2020/08/31/database-i-wish-i-had.adoc @@ -0,0 +1,299 @@ += The database I wish I had +:categories: mediator +:updatedat: 2020-09-03 + +:empty: +:values-talk: https://vimeo.com/230142234 +:haskell-startup: https://www.youtube.com/watch?v=ZR3Jirqk6W8 + +I watched the talk "{values-talk}[Platform as a Reflection of Values: Joyent, +Node.js and beyond]" by Bryan Cantrill, and I think he was able to put into +words something I already felt for some time: if there's no piece of software +out there that reflects your values, it's time for you to build that +software{empty}footnote:talk-time[ + At the very end, at time 29:49. When talking about the draft of this article + with a friend, he noted that Bryan O'Sullivan (a different Bryan) says a + similar thing on his talk "{haskell-startup}[Running a startup on Haskell]", + at time 4:15. +]. + +I kind of agree with what he said, because this is already happening to me. I +long for a database with a certain set of values, and for a few years I was just +waiting for someone to finally write it. After watching his talk, Bryan is +saying to me: "time to stop waiting, and start writing it yourself". + +So let me try to give an overview of such database, and go over its values. + +== Overview + +I want a database that allows me to create decentralized client-side +applications that can sync data. + +The best one-line description I can give right now is: + +____ +It's sort of like PouchDB, Git, Datomic, SQLite and Mentat. +____ + +A more descriptive version could be: + +____ +An embedded, immutable, syncable relational database. +____ + +Let's go over what I mean by each of those aspects one by one. + +=== Embedded + +:sqlite: https://sqlite.org/index.html +:sqlite-whentouse: https://sqlite.org/whentouse.html +:pouchdb: https://pouchdb.com/ +:couchdb: https://couchdb.apache.org/ +:mentat: https://github.com/mozilla/mentat +:pouchdb-adapters: https://pouchdb.com/adapters.html +:datomic-storage-services: https://docs.datomic.com/on-prem/storage.html +:sqlite-amalgamation: https://www.sqlite.org/amalgamation.html +:pointed-out: https://news.ycombinator.com/item?id=24338881 + +I think the server-side database landscape is diverse and mature enough for my +needs (even though I end up choosing SQLite most of the time), and what I'm +after is a database to be embedded on client-side applications itself, be it +desktop, browser, mobile, _etc._ + +The purpose of such database is not to keep some local cache of data in case of +lost connectivity: we have good solutions for that already. It should serve as +the source of truth, and allow the application to work on top of it. + +{sqlite}[*SQLite*] is a great example of that: it is a very powerful relational +database that runs {sqlite-whentouse}[almost anywhere]. What I miss from it +that SQLite doesn't provide is the ability to run it on the browser: even though +you could compile it to WebAssembly, [line-through]#it assumes a POSIX +filesystem that would have to be +emulated#{empty}footnote:posix-sqlite[ + It was {pointed-out}[pointed out to me] that SQLite doesn't assume the + existence of a POSIX filesystem, as I wrongly stated. Thanks for the + correction. +pass:[

] + This makes me consider it as a storage backend all by itself. I initially + considered having an SQLite storage backend as one implementation of the POSIX + filesystem storage API that I mentioned. My goal was to rely on it so I could + validate the correctness of the actual implementation, given SQLite's + robustness. +pass:[

] + However it may even better to just use SQLite, and get an ACID backend without + recreating a big part of SQLite from scratch. In fact, both Datomic and + PouchDB didn't create an storage backend for themselves, they just plugged on + what already existed and already worked. I'm beginning to think that it would + be wiser to just do the same, and drop entirely the from scratch + implementation that I mentioned. +pass:[

] + That's not to say that adding an IndexedDB compatibility layer to SQLite would + be enough to make it fit the other requirements I mention on this page. SQLite + still is an implementation of a update-in-place, SQL, table-oriented database. + It is probably true that cherry-picking the relevant parts of SQLite (like + storage access, consistency, crash recovery, parser generator, *etc.*) and + leaving out the unwanted parts (SQL, tables, threading, *etc.*) would be + better than including the full SQLite stack, that's simply an optimization. + Both could even coexist, if desired. +pass:[

] + SQLite would have to be treated similarly to how Datomic treats SQL databases: + instead of having a table for each entities, spread attributes over the + tables, *etc.*, it treats SQL databases as a key-value storage so it doesn't + have to re-implement interacting with the disk that other databases do well. +pass:[

] + The tables would contain blocks of binary data, so there isn't a difference on + how the SQLite storage backend behaves and how the IndexedDB storage backend + behaves, much like how Datomic works the same regardless of the storage + backend, same for PouchDB. +pass:[

] + I welcome corrections on what I said above, too. +]. + +{pouchdb}[*PouchDB*] is another great example: it's a full reimplementation of +{couchdb}[CouchDB] that targets JavaScript environments, mainly the browser and +Node.js. However I want a tool that can be deployed anywhere, and not limit its +applications to places that already have a JavaScript runtime environment, or +force the developer to bundle a JavaScript runtime environment with their +application. This is true for GTK+ applications, command line programs, Android +apps, _etc._ + +{mentat}[*Mentat*] was an interesting project, but its reliance on SQLite makes +it inherit most of the downsides (and benefits too) of SQLite itself. + +Having such a requirement imposes a different approach to storage: we have to +decouple the knowledge about the intricacies of storage from the usage of +storage itself, so that a module (say query processing) can access storage +through an API without needing to know about its implementation. This allows +the database to target a POSIX filesystems storage API and an IndexedDB storage +API, and make the rest of the code agnostic about storage. PouchDB has such +mechanism (called {pouchdb-adapters}[adapters]) and Datomic has them too (called +{datomic-storage-services}[storage services]). + +This would allow the database to adapt to where it is embedded: when targeting +the browser the IndexedDB storage API would provide the persistence layer that +the database requires, and similarly the POSIX filesystem storage API would +provide the persistence layer when targeting POSIX systems (like desktops, +mobile, _etc._). + +But there's also an extra restriction that comes from by being embedded: it +needs to provide and embeddable artifact, most likely a binary library object +that exposes a C compatible FFI, similar to {sqlite-amalgamation}[how SQLite +does]. Bundling a full runtime environment is possible, but doesn't make it a +compelling solution for embedding. This rules out most languages, and leaves +us with C, Rust, Zig, and similar options that can target POSIX systems and +WebAssembly. + +=== Immutable + +:datomic: https://www.datomic.com/ +:day-of-datomic: https://vimeo.com/116315075 +:git: https://git-scm.com/ +:sqlite-limits: https://sqlite.org/limits.html +:datomic-no-history: https://docs.datomic.com/cloud/best.html#nohistory-for-high-churn + +Being immutable means that only new information is added, no in-place update +ever happens, and nothing is ever deleted. + +Having an immutable database presents us with similar trade-offs found in +persistent data structures, like lack of coordination when doing reads, caches +being always coherent, and more usage of space. + +{datomic}[*Datomic*] is the go to database example of this: it will only add +information (datoms) and allows you to query them in a multitude of ways. +Stuart Halloway calls it "accumulate-only" over +"append-only"{empty}footnote:accumulate-only[ + Video "{day-of-datomic}[Day of Datomic Part 2]" on Datomic's information + model, at time 12:28. +]: + +____ +It's accumulate-only, it is not append-only. So append-only, most people when +they say that they're implying something physical about what happens. +____ + +Also a database can be append-only and overwrite existing information with new +information, by doing clean-ups of "stale" data. I prefer to adopt the +"accumulate-only" naming and approach. 
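
A rough sketch of what "accumulate-only" looks like in practice, using a
Datomic-like notation of `[entity attribute value transaction added?]` (the
entity and attribute below are made up): an update never rewrites the old fact,
it just accumulates a retraction plus a new assertion:

----
[42 :person/email "old@example.com" tx1 true]
[42 :person/email "old@example.com" tx2 false]  ; the retraction is itself a new fact
[42 :person/email "new@example.com" tx2 true]
----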
+ +{git}[*Git*] is another example of this: new commits are always added on top of +the previous data, and it grows by adding commits instead of replacing existing +ones. + +Git repositories can only grow in size, and that is not only an acceptable +condition, but also one of the reasons to use it. + +All this means that no in-place updates happens on data, and the database will +be much more concerned about how compact and efficiently it stores data than how +fast it does writes to disk. Being embedded, the storage limitation is either +a) how much storage the device has or b) how much storage was designed for the +application to consume. So even though the database could theoretically operate +with hundreds of TBs, a browser page or mobile application wouldn't have access +to this amount of storage. SQLite even {sqlite-limits}[says] that it does +support approximately 280 TBs of data, but those limits are untested. + +The upside of keeping everything is that you can have historical views of your +data, which is very powerful. This also means that applications should turn +this off when not +relevant{empty}footnote:no-history[ + Similar to {datomic-no-history}[Datomic's `:db/noHistory`]. +]. + +=== Syncable + +:3-way-merge: https://en.wikipedia.org/wiki/Merge_(version_control) +:git-remote-gcrypt: https://spwhitton.name/tech/code/git-remote-gcrypt/ + +This is a frequent topic when talking about offline-first solutions. When +building applications that: + +* can fully work offline, +* stores data, +* propagates that data to other application instances, + +then you'll need a conflict resolution strategy to handle all the situations +where different application instances disagree. Those application instances +could be a desktop and a browser version of the same application, or the same +mobile app in different devices. + +A three-way merge seems to be the best approach, on top of which you could add +application specific conflict resolution functions, like: + +* pick the change with higher timestamp; +* if one change is a delete, pick it; +* present the diff on the screen and allow the user to merge them. + +Some databases try to make this "easy", by choosing a strategy for you, but I've +found that different applications require different conflict resolution +strategies. Instead, the database should leave this up to the user to decide, +and provide tools for them to do it. + +{3-way-merge}[*Three-way merges in version control*] are the best example, +performing automatic merges when possible and asking the user to resolve +conflicts when they appear. + +The unit of conflict for a version control system is a line of text. The +database equivalent would probably be a single attribute, not a full entity or a +full row. + +Making all the conflict resolution logic be local should allow the database to +have encrypted remotes similar to how {git-remote-gcrypt}[git-remote-gcrypt] +adds this functionality to Git. This would enable users to sync the application +data across devices using an untrusted intermediary. + +=== Relational + +:datomic-datalog: https://docs.datomic.com/on-prem/query.html +:datomic-model: https://docs.datomic.com/cloud/whatis/data-model.html#datoms + +I want the power of relational queries on the client applications. + +Most of the arguments against traditional table-oriented relational databases +are related to write performance, but those don't apply here. The bottlenecks +for client applications usually aren't write throughput. 
Nobody is interested +in differentiating between 1 MB/s or 10 MB/s when you're limited to 500 MB +total. + +The relational model of the database could either be based on SQL and tables +like in SQLite, or maybe {datomic-datalog}[datalog] and {datomic-model}[datoms] +like in Datomic. + +== From aspects to values + +Now let's try to translate the aspects above into values, as suggested by Bryan +Cantrill. + +=== Portability + +Being able to target so many different platforms is a bold goal, and the +embedded nature of the database demands portability to be a core value. + +=== Integrity + +When the local database becomes the source of truth of the application, it must +provide consistency guarantees that enables applications to rely on it. + +=== Expressiveness + +The database should empower applications to slice and dice the data in any way +it wants to. + +== Next steps + +Since I can't find any database that fits these requirements, I've finally come +to terms with doing it myself. + +It's probably going to take me a few years to do it, and making it portable +between POSIX and IndexedDB will probably be the biggest challenge. I got +myself a few books on databases to start. + +I wonder if I'll ever be able to get this done. + +== External links + +:reddit: https://old.reddit.com/r/programming/comments/ijwz5b/the_database_i_wish_i_had/ +:lobsters: https://lobste.rs/s/m9vkg4/database_i_wish_i_had +:hn: https://news.ycombinator.com/item?id=24337244 +:list: https://lists.sr.ht/~euandreh/public-inbox/%3C010101744a592b75-1dce9281-f0b8-4226-9d50-fd2c7901fa72-000000%40us-west-2.amazonses.com%3E + +See discussions on {reddit}[Reddit], {lobsters}[lobsters], {hn}[HN] and {list}[a +lengthy email exchange]. diff --git a/src/content/en/blog/2020/10/05/cargo2nix-demo.tar.gz b/src/content/en/blog/2020/10/05/cargo2nix-demo.tar.gz new file mode 100644 index 0000000..43677ec Binary files /dev/null and b/src/content/en/blog/2020/10/05/cargo2nix-demo.tar.gz differ diff --git a/src/content/en/blog/2020/10/05/cargo2nix.adoc b/src/content/en/blog/2020/10/05/cargo2nix.adoc new file mode 100644 index 0000000..a2d478e --- /dev/null +++ b/src/content/en/blog/2020/10/05/cargo2nix.adoc @@ -0,0 +1,72 @@ += cargo2nix: Dramatically simpler Rust in Nix +:sort: 1 + +:empty: +:swift2nix: link:swift2nix.html +:cargo2nix: link:cargo2nix-demo.tar.gz + +In the same vein of my earlier post on {swift2nix}[swift2nix], I was able to +quickly prototype a Rust and Cargo variation of it: {cargo2nix}[cargo2nix]. + +The initial prototype is even smaller than swift2nix: it has only 37 lines of +code. + +Here's how to use it (snippet taken from the repo's README): + +[source,nix] +---- +let + niv-sources = import ./nix/sources.nix; + mozilla-overlay = import niv-sources.nixpkgs-mozilla; + pkgs = import niv-sources.nixpkgs { overlays = [ mozilla-overlay ]; }; + src = pkgs.nix-gitignore.gitignoreSource [ ] ./.; + cargo2nix = pkgs.callPackage niv-sources.cargo2nix { + lockfile = ./Cargo.lock; + }; +in pkgs.stdenv.mkDerivation { + inherit src; + name = "cargo-test"; + buildInputs = [ pkgs.latest.rustChannels.nightly.rust ]; + phases = [ "unpackPhase" "buildPhase" ]; + buildPhase = '' + # Setup dependencies path to satisfy Cargo + mkdir .cargo/ + ln -s ${cargo2nix.env.cargo-config} .cargo/config + ln -s ${cargo2nix.env.vendor} vendor + + # Run the tests + cargo test + touch $out + ''; +} +---- + +That `cargo test` part on line 20 is what I have been fighting with every +"*2nix" available for Rust out there. I don't want to bash any of them. 
All I +want is to have full control of what Cargo commands to run, and the "*2nix" tool +should only setup the environment for me. Let me drive Cargo myself, no need to +parameterize how the tool runs it for me, or even replicate its internal +behaviour by calling the Rust compiler directly. + +Sure it doesn't support private registries or Git dependencies, but how much +bigger does it has to be to support them? Also, it doesn't support those *yet*, +there's no reason it can't be extended. I just haven't needed it yet, so I +haven't added. Patches welcome. + +The layout of the `vendor/` directory is more explicit and public then what +swift2nix does: it is whatever the command `cargo vendor` returns. However I +haven't checked if the shape of the `.cargo-checksum.json` is specified, or +internal to Cargo. + +Try out the demo (also taken from the repo's README): + +[source,sh] +---- +pushd "$(mktemp -d)" +wget -O- https://euandre.org/static/attachments/cargo2nix-demo.tar.gz | + tar -xv +cd cargo2nix-demo/ +nix-build +---- + +Report back if you wish. diff --git a/src/content/en/blog/2020/10/05/cargo2nix.tar.gz b/src/content/en/blog/2020/10/05/cargo2nix.tar.gz new file mode 100644 index 0000000..d7224d9 Binary files /dev/null and b/src/content/en/blog/2020/10/05/cargo2nix.tar.gz differ diff --git a/src/content/en/blog/2020/10/05/swift2nix-demo.tar.gz b/src/content/en/blog/2020/10/05/swift2nix-demo.tar.gz new file mode 100644 index 0000000..cc8b4f1 Binary files /dev/null and b/src/content/en/blog/2020/10/05/swift2nix-demo.tar.gz differ diff --git a/src/content/en/blog/2020/10/05/swift2nix.adoc b/src/content/en/blog/2020/10/05/swift2nix.adoc new file mode 100644 index 0000000..9a3c6fe --- /dev/null +++ b/src/content/en/blog/2020/10/05/swift2nix.adoc @@ -0,0 +1,194 @@ += swift2nix: Run Swift inside Nix builds +:sort: 0 + +:empty: +:nix: https://nixos.org/ +:swift2nix: link:swift2nix.tar.gz + +While working on a Swift project, I didn't find any tool that would allow Swift +to run inside {nix}[Nix] builds. Even thought you _can_ run Swift, the real +problem arises when using the package manager. It has many of the same problems +that other package managers have when trying to integrate with Nix, more on this +below. + +I wrote a simple little tool called {swift2nix}[swift2nix] that allows you trick +Swift's package manager into assuming everything is set up. Here's the example +from swift2nix's README file: + +[source,nix] +---- +let + niv-sources = import ./nix/sources.nix; + pkgs = import niv-sources.nixpkgs { }; + src = pkgs.nix-gitignore.gitignoreSource [ ] ./.; + swift2nix = pkgs.callPackage niv-sources.swift2nix { + package-resolved = ./Package.resolved; + }; +in pkgs.stdenv.mkDerivation { + inherit src; + name = "swift-test"; + buildInputs = with pkgs; [ swift ]; + phases = [ "unpackPhase" "buildPhase" ]; + buildPhase = '' + # Setup dependencies path to satisfy SwiftPM + mkdir .build + ln -s ${swift2nix.env.dependencies-state-json} .build/dependencies-state.json + ln -s ${swift2nix.env.checkouts} .build/checkouts + + # Run the tests + swift test + touch $out + ''; +} +---- + +The key parts are lines 15~17: we just fake enough files inside `.build/` that +Swift believes it has already downloaded and checked-out all dependencies, and +just moves on to building them. + +I've worked on it just enough to make it usable for myself, so beware of +unimplemented cases. 
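
For reference, the faked layout that the `buildPhase` above creates inside
`.build/` is nothing more than two symlinks into the Nix store (store paths
elided here, and they will vary):

----
.build/
├── checkouts               -> /nix/store/…
└── dependencies-state.json -> /nix/store/…
----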
+ +== Design + +What swift2nix does is just provide you with the bare minimum that Swift +requires, and readily get out of the way: + +. I explicitly did not want to generated a `Package.nix` file, since + `Package.resolved` already exists and contains the required information; +. I didn't want to have an "easy" interface right out of the gate, after + fighting with "*2nix" tools that focus too much on that. + +The final actual code was so small (46 lines) that it made me think about +package managers, "*2nix" tools and some problems with many of them. + +== Problems with package managers + +I'm going to talk about solely language package managers. Think npm and cargo, +not apt-get. + +Package managers want to do too much, or assume too much, or just want to take +control of the entire build of the dependencies. + +This is a recurrent problem in package managers, but I don't see it as an +intrinsic one. There's nothing about a "package manager" that prevents it from +_declaring_ what it expects to encounter and in which format. The _declaring_ +part is important: it should be data, not code, otherwise you're back in the +same problem, just like lockfiles are just data. Those work in any language, +and tools can cooperate happily. + +There's no need for this declarative expectation to be standardized, or be made +compatible across languages. That would lead to a poor format that no package +manager really likes. Instead, If every package manager could say out loud what +it wants to see exactly, than more tools like swift2nix could exist, and they +would be more reliable. + +This could even work fully offline, and be simply a mapping from the lockfile +(the `Package.resolved` in Swift's case) to the filesystem representation. For +Swift, the `.build/dependencies-state.json` comes very close, but it is internal +to the package manager. + +Even though this pain only exists when trying to use Swift inside Nix, it sheds +light into this common implicit coupling that package managers have. They +usually have fuzzy boundaries and tight coupling between: + +. resolving the dependency tree and using some heuristic to pick a package + version; +. generating a lockfile with the exact pinned versions; +. downloading the dependencies present on the lockfile into some local cache; +. arranging the dependencies from the cache in a meaningful way for itself + inside the project; +. work using the dependencies while _assuming_ that step 4 was done. + +When you run `npm install` in a repository with no lockfile, it does 1~4. If +you do the same with `cargo build`, it does 1~5. That's too much: many of those +assumptions are implicit and internal to the package manager, and if you ever +need to rearrange them, you're on your own. Even though you can perform some of +those steps, you can't compose or rearrange them. + +Instead a much saner approach could be: + +. this stays the same; +. this also stays the same; +. be able to generate some JSON/TOML/edn which represents the local expected + filesystem layout with dependencies (i.e. exposing what the package manager + expects to find), let's call it `local-registry.json`; +. if a `local-registry.json` was provided, do a build using that. Otherwise + generate its own, by downloading the dependencies, arranging them, _etc._ + +The point is just making what the package manager requires visible to the +outside world via some declarative data. If this data wasn't provided, it can +move on to doing its own automatic things. 
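
To make this less abstract, here's a sketch of what such a
`local-registry.json` could contain. The shape below is entirely hypothetical;
the point is only that it is plain data, derived from the lockfile, mapping
each pinned dependency to where the package manager expects to find it on
disk:

[source,json]
----
{
  "registry-version": 1,
  "dependencies": [
    {
      "name": "some-dependency",
      "version": "1.2.3",
      "checksum": "sha256-…",
      "path": ".build/checkouts/some-dependency"
    }
  ]
}
----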
+ +By making the expectation explicit and public, one can plug tools _à la carte_ +if desired, but doesn't prevent the default code path of doing things the exact +same way they are now. + +== Problems with "*2nix" tools + +:node2nix: https://github.com/svanderburg/node2nix + +I have to admit: I'm unhappy with most of they. + +They conflate "using Nix" with "replicating every command of the package manager +inside Nix". + +The avoidance of an "easy" interface that I mentioned above comes from me +fighting with some of the "*2nix" tools much like I have to fight with package +managers: I don't want to offload all build responsibilities to the "*2nix" +tool, I just want to let it download some of the dependencies and get out of the +way. I want to stick with `npm test` or `cargo build`, and Nix should only +provide the environment. + +This is something that {node2nix}[node2nix] does right. It allows you to build +the Node.js environment to satisfy NPM, and you can keep using NPM for +everything else: + +[source,sh] +---- +ln -s ${node2nix-package.shell.nodeDependencies}/lib/node_modules ./node_modules +npm test +---- + +Its natural to want to put as much things into Nix as possible to benefit from +Nix's advantages. Isn't that how NixOS itself was born? + +But a "*2nix" tool should leverage Nix, not be coupled with it. The above +example lets you run any arbitrary NPM command while profiting from isolation +and reproducibility that Nix provides. It is even less brittle: any changes to +how NPM runs some things will be future-compatible, since node2nix isn't trying +to replicate what NPM does, or fiddling with NPM's internal. + +**A "*2nix" tool should build the environment, preferably from the lockfile +directly and offload everything else to the package manager**. The rest is just +nice-to-have. + +swift2nix itself could provide an "easy" interface, something that allows you to +write: + +[source,sh] +---- +nix-build -A swift2nix.release +nix-build -A swift2nix.test +---- + +The implementation of those would be obvious: create a new +`pkgs.stdenv.mkDerivation` and call `swift build -c release` and `swift test` +while using `swift2nix.env` under the hood. + +== Conclusion + +Package managers should provide exact dependencies via a data representation, +i.e. lockfiles, and expose via another data representation how they expect those +dependencies to appear on the filesystem, i.e. `local-registry.json`. This +allows package managers to provide an API so that external tools can create +mirrors, offline builds, other registries, isolated builds, _etc._ + +"*2nix" tools should build simple functions that leverage that +`local-registry.json`{empty}footnote:local-registry[ + This `local-registry.json` file doesn't have to be checked-in the repository + at all. It could be always generated on the fly, much like how Swift's + `dependencies-state.json` is. +] data and offload all the rest back to the package manager itself. This allows +the "*2nix" to not keep chasing the package manager evolution, always trying to +duplicate its behaviour. 
diff --git a/src/content/en/blog/2020/10/05/swift2nix.tar.gz b/src/content/en/blog/2020/10/05/swift2nix.tar.gz new file mode 100644 index 0000000..a22aaa0 Binary files /dev/null and b/src/content/en/blog/2020/10/05/swift2nix.tar.gz differ diff --git a/src/content/en/blog/2020/10/19/feature-flags.adoc b/src/content/en/blog/2020/10/19/feature-flags.adoc new file mode 100644 index 0000000..972f693 --- /dev/null +++ b/src/content/en/blog/2020/10/19/feature-flags.adoc @@ -0,0 +1,306 @@ += Feature flags: differences between backend, frontend and mobile +:categories: presentation +:updatedat: 2020-11-03 + +:empty: +:slides: link:../../../../slides/2020/10/19/feature-flags.html FIXME +:fowler-article: https://martinfowler.com/articles/feature-toggles.html + +_This article is derived from a {slides}[presentation] on the same subject._ + +When discussing about feature flags, I find that their costs and benefits are +often well exposed and addressed. Online articles like +"{fowler-article}[Feature Toggle (aka Feature Flags)]" do a great job of +explaining them in detail, giving great general guidance of how to apply +techniques to adopt it. + +However the weight of those costs and benefits apply differently on backend, +frontend or mobile, and those differences aren't covered. In fact, many of them +stop making sense, or the decision of adopting a feature flag or not may change +depending on the environment. + +In this article I try to make the distinction between environments and how +feature flags apply to them, with some final best practices I've acquired when +using them in production. + +== Why feature flags + +:atlassian-cicd: https://www.atlassian.com/continuous-delivery/principles/continuous-integration-vs-delivery-vs-deployment + +Feature flags in general tend to be cited on the context of +{atlassian-cicd}[continuous deployment]: + +____ +A: With continuous deployment, you deploy to production automatically + +B: But how do I handle deployment failures, partial features, _etc._? + +A: With techniques like canary, monitoring and alarms, feature flags, _etc._ +____ + +Though adopting continuous deployment doesn't force you to use feature flags, it +creates a demand for it. The inverse is also true: using feature flags on the +code points you more obviously to continuous deployment. Take the following +code sample for example, that we will reference later on the article: + +[source,javascript] +---- +function processTransaction() { + validate(); + persist(); + // TODO: add call to notifyListeners() +} +---- + +While being developed, being tested for suitability or something similar, +`notifyListeners()` may not be included in the code at once. So instead of +keeping it on a separate, long-lived branch, a feature flag can decide when the +new, partially implemented function will be called: + +[source,javascript] +---- +function processTransaction() { + validate(); + persist(); + if (featureIsEnabled("activate-notify-listeners")) { + notifyListeners(); + } +} +---- + +This allows your code to include `notifyListeners()`, and decide when to call it +at runtime. For the price of extra things around the code, you get more +dynamicity. + +So the fundamental question to ask yourself when considering adding a feature +flag should be: + +____ +Am I willing to pay with code complexity to get dynamicity? +____ + +It is true that you can make the management of feature flags as straightforward +as possible, but having no feature flags is simpler than having any. 
What you get in return is the ability to parameterize the behaviour of the
application at runtime, without any code changes.

Sometimes this added complexity may tilt the balance towards not using a
feature flag, and sometimes the flexibility of changing behaviour at runtime is
absolutely worth the added complexity. This can vary a lot by code base and
feature, but fundamentally by environment: it's much cheaper to deploy a new
version of a service than to release a new version of an app.

So the question of which environment is being targeted is key when reasoning
about costs and benefits of feature flags.

== Control over the environment

:fdroid: https://f-droid.org/
:bad-apple: https://www.paulgraham.com/apple.html

The key differentiator that makes the trade-offs apply differently is how much
control you have over the environment.

When running a *backend* service, you are usually paying for the servers
themselves, and can tweak them as you wish. This means you have full control to
make code changes as you wish. Not only that, you decide when to do it, and for
how long the transition will last.

On the *frontend* you have less control: even though you can choose to make a
new version available any time you wish, you can't
force{empty}footnote:force[
  Technically you could force a reload with JavaScript using
  `window.location.reload()`, but that not only is invasive and impolite, but
  also gives you the illusion that you have control over the client when you
  actually don't: clients with disabled JavaScript would be immune to such
  tactics.
] clients to immediately switch to the new version. That means that a) clients
could skip upgrades at any time and b) you always have to keep backward and
forward compatibility in mind.

Even though I'm mentioning frontend directly, it applies to other environments
with similar characteristics: desktop applications, command-line programs,
_etc_.

On *mobile* you have even less control: app stores need to allow your app to be
updated, which could bite you when least desired. Theoretically you could make
your APK available on third-party stores like {fdroid}[F-Droid], or even make
the APK itself available for direct download, which would give you the same
characteristics of a frontend application, but that happens less often.

On iOS you can't even do that. You have to get Apple's blessing on every single
update. Even though we have known for over a decade that this is a
{bad-apple}[bad idea], there isn't a way around it. This is where you have the
least control.

In practice, the amount of control you have will change how much you value
dynamicity: the less control you have, the more valuable it is. In other words,
having a dynamic flag on the backend may or may not be worth it, since you
could always update the code immediately after, but on iOS it is basically
always worth it.

== Rollout

:kubernetes-deployment: https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#creating-a-deployment
:play-store-rollout: https://support.google.com/googleplay/android-developer/answer/6346149?hl=en
:app-store-rollout: https://help.apple.com/app-store-connect/#/dev3d65fcee1

A rollout is used to _roll out_ a new version of software.

They are usually short-lived, being relevant as long as the new code is being
deployed. The most common rule is percentages.
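
As a rough sketch of the percentage rule, the decision can be as simple as
hashing a stable identifier into a bucket and comparing it against the rollout
percentage. The helper below is made up for illustration; in practice this
logic usually lives in the load balancer or deployment infrastructure rather
than in application code:

[source,javascript]
----
// Minimal sketch of a percentage-based rollout decision. The hash function
// and the 100 buckets are arbitrary choices for illustration.
function bucketOf(id) {
  let hash = 0;
  for (const char of id) {
    hash = (hash * 31 + char.charCodeAt(0)) % 100;
  }
  return hash;
}

function isInRollout(userId, percentage) {
  // The same user always lands in the same bucket, so the decision is stable
  // across requests while the rollout percentage increases.
  return bucketOf(userId) < percentage;
}

isInRollout("user-42", 25); // true for roughly 25% of users
----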

On the *backend*, it is common to find it on the deployment infrastructure
itself, like canary servers, blue/green deployments,
{kubernetes-deployment}[a kubernetes deployment rollout], _etc_. You could do
those manually, by having a dynamic control on the code itself, but rollbacks
are cheap enough that people usually do a normal deployment and just give some
extra attention to the metrics dashboard.

Any time you see a blue/green deployment, there is a rollout happening: most
likely a load balancer is starting to direct traffic to the new server, until
reaching 100% of the traffic. Effectively, that is a rollout.

On the *frontend*, you can selectively pick which users will be able to
download the new version of a page. You could use geographical region, IP,
cookie or something similar to make this decision.

CDN propagation delays and people not refreshing their web pages are also
rollouts by themselves, since old and new versions of the software will
coexist.

On *mobile*, the Play Store allows you to perform fine-grained
{play-store-rollout}[staged rollouts], and the App Store allows you to perform
limited {app-store-rollout}[phased releases].

Both for Android and iOS, the user plays the role of making the download.

In summary: since you control the servers on the backend, you can do rollouts
at will, and those are often found automated away in base infrastructure. On
the frontend and on mobile, there are ways to make new versions available, but
users may not download them immediately, and many different versions of the
software end up coexisting.

== Feature flag

A feature flag is a _flag_ that tells the application at runtime to turn on or
off a given _feature_. That means that the actual production code will have
more than one possible code path to go through, and that a new version of a
feature coexists with the old version. The feature flag tells which part of the
code to go through.

They are usually medium-lived, being relevant as long as the new code is being
developed. The most common rules are percentages, allow/deny lists, A/B groups
and client version.

On the *backend*, those are useful for things that have a long development
cycle, or that need to be done in steps. Consider loading the feature flag
rules in memory when the application starts, so that you avoid querying a
database or an external service for applying a feature flag rule, and avoid
flakiness in the result due to intermittent network failures.

Since on the *frontend* you don't control when to update the client software,
you're left with applying the feature flag rule on the server, and exposing the
value through an API for maximum dynamicity. This could be in the frontend code
itself, with a fallback to a "just refresh the page"/"just update to the latest
version" strategy for less dynamic scenarios.

On *mobile* you can't even rely on a "just update to the latest version"
strategy, since the code for the app could be updated to a new feature and be
blocked on the store. Those cases aren't recurrent, but you should always
assume the store will deny updates at critical moments, so you don't find
yourself with no cards to play. That means the only control you actually have
is via the backend, by parameterizing the runtime of the application using the
API. In practice, you should always have a feature flag to control any relevant
piece of code. There is no such thing as a code change "too small for a feature
flag".
What you should ask yourself is: + +____ +If the code I'm writing breaks and stays broken for around a month, do I care? +____ + +If you're doing an experimental screen, or something that will have a very small +impact you might answer "no" to the above question. For everything else, the +answer will be "yes": bug fixes, layout changes, refactoring, new screen, +filesystem/database changes, _etc_. + +== Experiment + +An experiment is a feature flag where you care about analytical value of the +flag, and how it might impact user's behaviour. A feature flag with analytics. + +They are also usually medium-lived, being relevant as long as the new code is +being developed. The most common rule is A/B test. + +On the *backend*, an experiment rely on an analytical environment that will pick +the A/B test groups and distributions, which means those can't be held in memory +easily. That also means that you'll need a fallback value in case fetching the +group for a given customer fails. + +On the *frontend* and on *mobile* they are no different from feature flags. + +== Operational toggle + +An operational toggle is like a system-level manual circuit breaker, where you +turn on/off a feature, fail over the load to a different server, _etc_. They +are useful switches to have during an incident. + +They are usually long-lived, being relevant as long as the code is in +production. The most common rule is percentages. + +They can be feature flags that are promoted to operational toggles on the +*backend*, or may be purposefully put in place preventively or after a +postmortem analysis. + +On the *frontend* and on *mobile* they are similar to feature flags, where the +"feature" is being turned on and off, and the client interprets this value to +show if the "feature" is available or unavailable. + +== Best practices + +=== Prefer dynamic content + +Even though feature flags give you more dynamicity, they're still somewhat +manual: you have to create one for a specific feature and change it by hand. + +If you find yourself manually updating a feature flags every other day, or +tweaking the percentages frequently, consider making it fully dynamic. Try +using a dataset that is generated automatically, or computing the content on the +fly. + +Say you have a configuration screen with a list of options and sub-options, and +you're trying to find how to better structure this list. Instead of using a +feature flag for switching between 3 and 5 options, make it fully dynamic. This +way you'll be able to perform other tests that you didn't plan, and get more +flexibility out of it. + +=== Use the client version to negotiate feature flags + +After effectively finishing a feature, the old code that coexisted with the new +one will be deleted, and all traces of the transition will vanish from the code +base. However if you just remove the feature flags from the API, all of the old +versions of clients that relied on that value to show the new feature will go +downgrade to the old feature. + +This means that you should avoid deleting client-facing feature flags, and +retire them instead: use the client version to decide when the feature is +stable, and return `true` for every client with a version greater or equal to +that. This way you can stop thinking about the feature flag, and you don't +break or downgrade clients that didn't upgrade past the transition. + +=== Beware of many nested feature flags + +Nested flags combine exponentially. 
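
A small sketch of why, reusing the `featureIsEnabled` example from earlier (the
inner flag and the `notifyListenersV2()` function are hypothetical, only here
to illustrate the nesting):

[source,javascript]
----
// Two independent flags already produce four flag combinations that all need
// to keep working; every extra nested flag doubles that number again.
function processTransaction() {
  validate();
  persist();
  if (featureIsEnabled("activate-notify-listeners")) {
    if (featureIsEnabled("use-new-notification-service")) {
      notifyListenersV2();
    } else {
      notifyListeners();
    }
  }
  // ...plus the two combinations where the outer flag is off.
}
----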
+ +Pick strategic entry points or transitions eligible for feature flags, and +beware of their nesting. + +=== Include feature flags in the development workflow + +Add feature flags to the list of things to think about during whiteboarding, and +deleting/retiring a feature flags at the end of the development. + +=== Always rely on a feature flag on the app + +Again, there is no such thing "too small for a feature flag". Too many feature +flags is a good problem to have, not the opposite. Automate the process of +creating a feature flag to lower its cost. diff --git a/src/content/en/blog/2020/10/20/wrong-interviewing.adoc b/src/content/en/blog/2020/10/20/wrong-interviewing.adoc new file mode 100644 index 0000000..4b8d855 --- /dev/null +++ b/src/content/en/blog/2020/10/20/wrong-interviewing.adoc @@ -0,0 +1,340 @@ += How not to interview engineers +:updatedat: 2020-10-24 + +:bad-article: https://defmacro.substack.com/p/how-to-interview-engineers +:satire-comment: https://defmacro.substack.com/p/how-to-interview-engineers/comments#comment-599996 +:double-down: https://twitter.com/spakhm/status/1315754730740617216 +:poes-law: https://en.wikipedia.org/wiki/Poe%27s_law +:hn-comment-1: https://news.ycombinator.com/item?id=24757511 + +This is a response to Slava's "{bad-article}[How to interview engineers]" +article. I initially thought it was a satire, {satire-comment}[as have others], +but he has [doubled down on it]: + +____ +(...) Some parts are slightly exaggerated for sure, but the essay isn't meant as +a joke. +____ + +That being true, he completely misses the point on how to improve hiring, and +proposes a worse alternative on many aspects. It doesn't qualify as +provocative, it is just wrong. + +I was comfortable taking it as a satire, and I would just ignore the whole thing +if it wasn't (except for the technical memo part), but friends of mine +considered it to be somewhat reasonable. This is a adapted version of parts of +the discussions we had, risking becoming a gigantic showcase of {poes-law}[Poe's +law]. + +In this piece, I will argument against his view, and propose an alternative +approach to improve hiring. + +It is common to find people saying how broken technical hiring is, as well put +in words by a phrase on {hn-comment-1}[this comment]: + +____ +Everyone loves to read and write about how developer interviewing is flawed, but +no one wants to go out on a limb and make suggestions about how to improve it. +____ + +I guess Slava was trying to not fall on this trap, and make a suggestion on how +to improve instead, which all went terribly wrong. + +== What not to do + +=== Time candidates + +:hammock-driven-talk: https://www.youtube.com/watch?v=f84n5oFoZBc + +Timing the candidate shows up on the "talent" and "judgment" sections, and they +are both bad ideas for the same reason: programming is not a performance. + +What do e-sports, musicians, actors and athletes have in common: performance +psychologists. + +For a pianist, their state of mind during concerts is crucial: they not only +must be able to deal with stage anxiety, but to become really successful they +will have to learn how to exploit it. The time window of the concert is what +people practice thousands of hours for, and it is what defines one's career, +since how well all the practice went is irrelevant to the nature of the +profession. Being able to leverage stage anxiety is an actual goal of them. 
+ +That is also applicable to athletes, where the execution during a competition +makes them sink or swim, regardless of how all the training was. + +The same cannot be said about composers, though. They are more like book +writers, where the value is not on very few moments with high adrenaline, but on +the aggregate over hours, days, weeks, months and years. A composer may have a +deadline to finish a song in five weeks, but it doesn't really matter if it is +done on a single night, every morning between 6 and 9, at the very last week, or +any other way. No rigid time structure applies, only whatever fits best to the +composer. + +Programming is more like composing than doing a concert, which is another way of +saying that programming is not a performance. People don't practice algorithms +for months to keep them at their fingertips, so that finally in a single +afternoon they can sit down and write everything at once in a rigid 4 hours +window, and launch it immediately after. + +Instead software is built iteratively, by making small additions, than +refactoring the implementation, fixing bugs, writing a lot at once, _etc_. all +while they get a firmer grasp of the problem, stop to think about it, come up +with new ideas, _etc_. + +Some specifically plan for including spaced pauses, and call it +"{hammock-driven-talk}[Hammock Driven Development]", which is just artist's +"creative idleness" for hackers. + +Unless you're hiring for a live coding group, a competitive programming team, or +a professional live demoer, timing the candidate that way is more harmful than +useful. This type of timing doesn't find good programmers, it finds performant +programmers, which isn't the same thing, and you'll end up with people who can +do great work on small problems but who might be unable to deal with big +problems, and loose those who can very well handle huge problems, slowly. If +you are lucky you'll get performant people who can also handle big problems on +the long term, but maybe not. + +An incident is the closest to a "performance" that it gets, and yet it is still +dramatically different. Surely it is a high stress scenario, but while people +are trying to find a root cause and solve the problem, only the downtime itself +is visible to the exterior. It is like being part of the support staff +backstage during a play: even though execution matters, you're still not on the +spot. During an incident you're doing debugging in anger rather than live +coding. + +Although giving a candidate the task to write a "technical memo" has potential +to get a measure of the written communication skills of someone, doing so in a +hard time window also misses the point for the same reasons. + +=== Pay attention to typing speed + +:dijkstra-typing: https://www.cs.utexas.edu/users/EWD/transcriptions/EWD05xx/EWD512.html +:speech-to-text: https://www.youtube.com/watch?v=Mz3JeYfBTcY +:j-lang: https://www.jsoftware.com/#/ + +Typing is speed in never the bottleneck of a programmer, no matter how great +they are. + +As {dijkstra-typing}[Dijkstra said]: + +____ +But programming, when stripped of all its circumstantial irrelevancies, boils +down to no more and no less than very effective thinking so as to avoid +unmastered complexity, to very vigorous separation of your many different +concerns. +____ + +In other words, programming is not about typing, it is about thinking. + +Otherwise, the way to get those star programmers that can't type fast enough a +huge productivity boost is to give them a touch typing course. 
If they are so productive with typing speed being a limitation, imagine what
they could accomplish if they had razor-sharp touch typing skills?

Also, why stop there? A good touch typist can do 90 WPM (words per minute), and
a great one can do 120 WPM, but with a stenography keyboard they get to 200
WPM+. That is double the productivity! Why not try
{speech-to-text}[speech-to-text]? Make them all use {j-lang}[J] so they all
need to type less! How come nobody thought of that?

And what if someone couldn't solve the programming puzzle in the given time
window, but could come back the following day with an implementation that is
not only faster, but uses less memory, and is simpler to understand and easier
to read than anybody else's? You'd be losing that person too.

=== IQ

:determination-article: https://www.paulgraham.com/determination.html
:scihub-article: https://sci-hub.do/https://psycnet.apa.org/doiLanding?doi=10.1037%2F1076-8971.6.1.33

For "building an extraordinary team at a hard technology startup", intelligence
is not the most important thing; {determination-article}[determination is].

And talent isn't "IQ specialized for engineers". IQ itself isn't a measure of
how intelligent someone is. Ever since Alfred Binet and Théodore Simon started
to formalize what would years later become IQ tests, they already acknowledged
the limitations of the technique for measuring intelligence, which is
{scihub-article}[still true today].

So having a high IQ tells only how smart people are for a particular aspect of
intelligence, which is not representative of programming. There are numerous
aspects of programming that are not covered by IQ measurement: how to name
variables and functions, how to create models which are compatible with schema
evolution, how to make the system dynamic for runtime parameterization without
making it fragile, how to measure and observe performance and availability, how
to pick between acquiring and paying technical debt, _etc_.

Not to speak of everything else that a programmer does that is not purely
programming. Saying high IQ correlates with great programming is a stretch, at
best.

=== Ditch HR

Slava tangentially picks on HR, and I will digress on that a bit:

____
A good rule of thumb is that if a question could be asked by an intern in HR,
it's a non-differential signaling question.
____

Stretching it, this is a rather snobbish view of HR. Why is it that an intern
in HR can't ask signaling questions? Could the same be said of an intern in
engineering?

In other words: is the question not signaling because the one asking is from
HR, or because the one asking is an intern? If the latter, then he's just
arguing that interns have no place in interviewing, but if the former, then he
was picking on HR.

Extrapolating that, it is common to find people who don't value HR's work, and
only see them as inferiors doing unpleasant work, who aren't capable enough (or
_smart_ enough) to learn programming.

This is equivalent to people who work primarily on the backend, see others
working on the frontend struggling, and say: "isn't it just building views and
showing them on the browser? How could it possibly be that hard? I bet I could
do it better, with 20% of the code". As you already know, the answer to it is
"well, why don't you go do it, then?".

This sense of superiority ignores the fact that HR has actual professionals
doing actual hard work, not unlike programmers.
If HR is inferior and so easy, +why not automate everything away and get rid of a whole department? + +I don't attribute this world view to Slava, this is only an extrapolation of a +snippet of the article. + +=== Draconian mistreating of candidates + +:bad-apple: https://www.paulgraham.com/apple.html +:be-good: https://www.paulgraham.com/good.html + +If I found out that people employed theatrics in my interview so that I could +feel I've "earned the privilege to work at your company", I would quit. + +If your moral compass is so broken that you are comfortable mistreating me while +I'm a candidate, I immediately assume you will also mistreat me as an employee, +and that the company is not a good place to work, as {bad-apple}[evil begets +stupidity]: + +____ +But the other reason programmers are fussy, I think, is that evil begets +stupidity. An organization that wins by exercising power starts to lose the +ability to win by doing better work. And it's not fun for a smart person to +work in a place where the best ideas aren't the ones that win. I think the +reason Google embraced "Don't be evil" so eagerly was not so much to impress the +outside world as to inoculate themselves against arrogance. +____ + +Paul Graham goes beyond "don't be evil" with a better motto: +"{be-good}[be good]". + +Abusing the asymmetric nature of an interview to increase the chance that the +candidate will accept the offer is, well, abusive. I doubt a solid team can +actually be built on such poor foundations, surrounded by such evil measures. + +And if you really want to give engineers "the measure of whoever they're going +to be working with", there are plenty of reasonable ways of doing it that don't +include performing fake interviews. + +=== Personality tests + +Personality tests around the world need to be a) translated, b) adapted and c) +validated. Even though a given test may be applicable and useful in a country, +this doesn't imply it will work for other countries. + +Not only tests usually come with translation guidelines, but also its +applicability needs to be validated again after the translation and adaptation +is done to see if the test still measures what it is supposed to. + +That is also true within the same language. If a test is shown to work in +England, it may not work in New Zealand, in spite of both speaking english. The +cultural context difference is influent to the point of invalidating a test and +making it be no longer valid. + +Irregardless of the validity of the proposed "big five" personality test, saying +"just use attributes x, y and z this test and you'll be fine" is a rough +simplification, much like saying "just use Raft for distributed systems, after +all it has been proven to work" shows he throws all of that background away. + +So much as applying personality tests themselves is not a trivial task, and +psychologists do need special training to become able to effectively apply one. + +=== More cargo culting + +:cult: https://calteches.library.caltech.edu/51/2/CargoCult.htm +:cult-archived: https://web.archive.org/web/20201003090303/https://calteches.library.caltech.edu/51/2/CargoCult.htm + +He calls the ill-defined "industry standard" to be cargo-culting, but his +proposal isn't sound enough to not become one. + +Even if the ideas were good, they aren't solid enough, or based on solid enough +things to make them stand out by themselves. Why is it that talent, judgment +and personality are required to determine the fitness of a good candidate? Why +not 2, 5, or 20 things? 
Why those specific 3? Why is talent defined like that? +Is it just because he found talent to be like that? + +Isn't that definitionally also +{cult}[cargo-culting]footnote:cargo-cult[ + {cult-archived}[Archived version]. +]? Isn't he just repeating whatever he found to work form him, without +understanding why? + +What Feynman proposes is actually the opposite: + +____ +In summary, the idea is to try to give *all* of the information to help others +to judge the value of your contribution; not just the information that leads to +judgment in one particular direction or another. +____ + +What Slava did was just another form of cargo culting, but this was one that he +believed to work. + +== What to do + +I will not give you a list of things that "worked for me, thus they are +correct". I won't either critique the current "industry standard", nor what +I've learned from interviewing engineers. + +Instead, I'd like to invite you to learn from history, and from what other +professionals have to teach us. + +Programming isn't an odd profession, where everything about it is different from +anything else. It is just another episode in the "technology" series, which has +seasons since before recorded history. It may be an episode where things move a +bit faster, but it is fundamentally the same. + +So here is the key idea: what people did _before_ software engineering? + +What hiring is like for engineers in other areas? Don't civil, electrical and +other types of engineering exist for much, much longer than software engineering +does? What have those centuries of accumulated experience thought the world +about technical hiring? + +What studies were performed on the different success rate of interviewing +strategies? What have they done right and what have they done wrong? + +What is the purpose of HR? Why do they even exist? Do we need them, and if so, +what for? What is the value they bring, since everybody insist on building an +HR department in their companies? Is the existence of HR another form of cargo +culting? + +What is industrial and organizational psychology? What is that field of study? +What do they specialize in? What have they learned since the discipline +appeared? What have they done right and wrong over history? Is is the current +academic consensus on that area? What is a hot debate topic in academia on that +area? What is the current bleeding edge of research? What can they teach us +about hiring? What can they teach us about technical hiring? + +== Conclusion + +If all I've said makes me a "no hire" in the proposed framework, I'm really +glad. + +This says less about my programming skills, and more about the employer's world +view, and I hope not to be fooled into applying for a company that adopts this +one. + +Claiming to be selecting "extraordinary engineers" isn't an excuse to reinvent +the wheel, poorly. 
diff --git a/src/content/en/blog/2020/11/07/diy-bugs.adoc b/src/content/en/blog/2020/11/07/diy-bugs.adoc new file mode 100644 index 0000000..8ab7953 --- /dev/null +++ b/src/content/en/blog/2020/11/07/diy-bugs.adoc @@ -0,0 +1,93 @@ += DIY an offline bug tracker with text files, Git and email +:updatedat: 2021-08-14 + +:attack-on-ytdl: https://github.com/github/dmca/blob/master/2020/10/2020-10-23-RIAA.md +:list-discussions: https://sourcehut.org/blog/2020-10-29-how-mailing-lists-prevent-censorship/ +:docs-in-repo: https://podcast.writethedocs.org/2017/01/25/episode-3-trends/ +:ci-in-notes: link:../../../../tils/2020/11/30/git-notes-ci.html +:todos-mui: https://man.sr.ht/todo.sr.ht/#email-access +:git-bug-bridges: https://github.com/MichaelMure/git-bug#bridges + +When {attack-on-ytdl}[push comes to shove], the operational aspects of +governance of a software project matter a lot. And everybody likes to chime in +with their alternative of how to avoid single points of failure in project +governance, just like I'm doing right now. + +The most valuable assets of a project are: + +. source code +. discussions +. documentation +. builds +. tasks and bugs + +For *source code*, Git and other DVCS solve that already: everybody gets a full +copy of the entire source code. + +If your code forge is compromised, moving it to a new one takes a couple of +minutes, if there isn't a secondary remote serving as mirror already. In this +case, no action is required. + +If you're having your *discussions* by email, "{list-discussions}[taking this +archive somewhere else and carrying on is effortless]". + +Besides, make sure to backup archives of past discussions so that the history is +also preserved when this migration happens. + +The *documentation* should {docs-in-repo}[live inside the repository +itself]footnote:writethedocs-in-repo[ + Described as "the ultimate marriage of the two". Starts at time 31:50. +], so that not only it gets first class treatment, but also gets distributed to +everybody too. Migrating the code to a new forge already migrates the +documentation with it. + +As long as you keep the *builds* vendor neutral, the migration should only +involve adapting how you call your `tests.sh` from the format of +`provider-1.yml` uses to the format that `provider-2.yml` accepts. It isn't +valuable to carry the build history with the project, as this data quickly +decays in value as weeks and months go by, but for simple text logs +{ci-in-notes}[using Git notes] may be just enough, and they would be replicated +with the rest of the repository. + +But for *tasks and bugs* many rely on a vendor-specific service, where +you register and manage those issues via a web browser. Some provide an +{todos-mui}[interface for interacting via email] or an API for +{git-bug-bridges[bridging local bugs with vendor-specific services]. But +they're all layers around the service, that disguises it as being a central +point of failure, which when compromised would lead to data loss. When push +comes to shove, you'd loose data. 
+ +== Alternative: text files, Git and email + +:todos-example: https://euandre.org/git/remembering/tree/TODOs.md?id=3f727802cb73ab7aa139ca52e729fd106ea916d0 +:todos-script: https://euandre.org/git/remembering/tree/aux/workflow/TODOs.sh?id=3f727802cb73ab7aa139ca52e729fd106ea916d0 +:todos-html: https://euandreh.xyz/remembering/TODOs.html +:fossil-tickets: https://fossil-scm.org/home/doc/trunk/www/bugtheory.wiki + +Why not do the same as documentation, and move tasks and bugs into the +repository itself? + +It requires no extra tool to be installed, and fits right in the already +existing workflow for source code and documentation. + +I like to keep a {todos-example}[`TODOs.md`] file at the repository top-level, +with two relevant sections: "tasks" and "bugs". Then when building the +documentation I'll just {todos-script}[generate an HTML file from it], and +{todos-html}[publish] it alongside the static website. All that is done on the +main branch. + +Any issues discussions are done in the mailing list, and a reference to a +discussion could be added to the ticket itself later on. External contributors +can file tickets by sending a patch. + +The good thing about this solution is that it works for 99% of projects out +there. + +For the other 1%, having Fossil's "{fossil-tickets}[tickets]" could be an +alternative, but you may not want to migrate your project to Fossil to get those +niceties. + +Even though I keep a `TODOs.md` file on the main branch, you can have a `tasks` +branch with a `task-n.md` file for each task, or any other way you like. + +These tools are familiar enough that you can adjust it to fit your workflow. diff --git a/src/content/en/blog/2020/11/08/paradigm-shift-review.adoc b/src/content/en/blog/2020/11/08/paradigm-shift-review.adoc new file mode 100644 index 0000000..1110085 --- /dev/null +++ b/src/content/en/blog/2020/11/08/paradigm-shift-review.adoc @@ -0,0 +1,154 @@ += The Next Paradigm Shift in Programming - video review +:categories: video-review + +:reviewed-video: https://www.youtube.com/watch?v=6YbK8o9rZfI + +This is a review with comments of "{reviewed-video}[The Next Paradigm Shift in +Programming]", by Richard Feldman. + +This video was _strongly_ suggested to me by a colleague. I wanted to discuss +it with her, and when drafting my response I figured I could publish it publicly +instead. + +Before anything else, let me just be clear: I really like the talk, and I think +Richard is a great public speaker. I've watched several of his talks over the +years, and I feel I've followed his career at a distance, with much respect. +This isn't a piece criticizing him personally, and I agree with almost +everything he said. These are just some comments but also nitpicks on a few +topics I think he missed, or that I view differently. + +== Structured programming + +:forgotten-art-video: https://www.youtube.com/watch?v=SFv8Wm2HdNM + +The historical overview at the beginning is very good. In fact, the very video +I watched previously was about structured programming! + +Kevlin Henney on "{forgotten-art-video}[The Forgotten Art of Structured +Programming]" does a deep-dive on the topic of structured programming, and how +on his view it is still hidden in our code, when we do a `continue` or a `break` +in some ways. Even though it is less common to see an explicit `goto` in code +these days, many of the original arguments of Dijkstra against explicit `goto`s +is applicable to other constructs, too. 
+ +This is a very mature view, and I like how he goes beyond the "don't use +`goto`s" heuristic and proposes and a much more nuanced understanding of what +"structured programming" means. + +In a few minutes, Richard is able to condense most of the significant bits of +Kevlin's talk in a didactical way. Good job. + +== OOP like a distributed system + +:joe-oop: https://www.infoq.com/interviews/johnson-armstrong-oop/ +:rich-hickey-oop: https://www.youtube.com/watch?v=ROor6_NGIWU + +Richard extrapolates Alan Kay's original vision of OOP, and he concludes that it +is more like a distributed system that how people think about OOP these days. +But he then states that this is a rather bad idea, and we shouldn't pursue it, +given that distributed systems are known to be hard. + +However, his extrapolation isn't really impossible, bad or an absurd. In fact, +it has been followed through by Erlang. Joe Armstrong used to say that +"{joe-oop}[Erlang might the only OOP language]", since it actually adopted this +paradigm. + +But Erlang is a functional language. So this "OOP as a distributed system" view +is more about designing systems in the large than programs in the small. + +There is a switch of levels in this comparison I'm making, as can be done with +any language or paradigm: you can have a functional-like system that is built +with an OOP language (like a compiler, that given the same input will produce +the same output), or an OOP-like system that is built with a functional +language (Rich Hickey calls it "{rich-hickey-oop}[OOP in the +large]"footnote:langsys[ + From 24:05 to 27:45. +]). + +So this jump from in-process paradigm to distributed paradigm is rather a big +one, and I don't think you he can argue that OOP has anything to say about +software distribution across nodes. You can still have Erlang actors that run +independently and send messages to each other without a network between them. +Any OTP application deployed on a single node effectively works like that. + +I think he went a bit too far with this extrapolation. Even though I agree it +is a logical a fair one, it isn't evidently bad as he painted. I would be fine +working with a single-node OTP application and seeing someone call it "a _real_ +OOP program". + +== First class immutability + +:immer: https://sinusoid.es/immer/ +:immutable-js: https://immutable-js.github.io/immutable-js/ + +I agree with his view of languages moving towards the functional paradigm. But +I think you can narrow down the "first-class immutability" feature he points out +as present on modern functional programming languages to "first-class immutable +data structures". + +I wouldn't categorize a language as "supporting functional programming style" +without a library for functional data structures it. By discipline you can +avoid side-effects, write pure functions as much as possible, and pass functions +as arguments around is almost every language these days, but if when changing an +element of a vector mutates things in-place, that is still not functional +programming. + +To avoid that, you end-up needing to make clones of objects to pass to a +function, using freezes or other workarounds. All those cases are when the +underlying mix of OOP and functional programming fail. + +There are some languages with third-party libraries that provide functional data +structures, like {immer}[immer] for C++, or {immutable-js}[ImmutableJS] for +JavaScript. 
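
As a small sketch of the difference, compare mutating a plain JavaScript array
in place with updating an ImmutableJS list (assuming the `immutable` package is
available):

[source,javascript]
----
const { List } = require("immutable");

// Plain array: updating an element mutates it in place, the old value is gone.
const array = [1, 2, 3];
array[0] = 10;

// ImmutableJS: `set` returns a new list and leaves the original untouched.
const before = List([1, 2, 3]);
const after = before.set(0, 10);

before.get(0); // 1
after.get(0);  // 10
----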
+ +But functional programming is more easily achievable in languages that have them +built-in, like Erlang, Elm and Clojure. + +== Managed side-effects + +:redux: https://redux.js.org/ +:re-frame: https://github.com/Day8/re-frame + +His proposal of adopting managed side-effects as a first-class language concept +is really intriguing. + +This is something you can achieve with a library, like {redux}[Redux] for +JavaScript or {re-frame}[re-frame] for Clojure. + +I haven't worked with a language with managed side-effects at scale, and I don't +feel this is a problem with Clojure or Erlang. But is this me finding a flaw in +his argument or not acknowledging a benefit unknown to me? This is a +provocative question I ask myself. + +Also all FP languages with managed side-effects I know are statically-typed, and +all dynamically-typed FP languages I know don't have managed side-effects baked +in. + +== What about declarative programming? + +:tarpit-article: https://curtclifton.net/papers/MoseleyMarks06a.pdf + +In "{tarpit-article}[Out of the Tar Pit]", B. Moseley and P. Marks go beyond his +view of functional programming as the basis, and name a possible "functional +relational programming" as an even better solution. They explicitly call out +some flaws in most of the modern functional programming languages, and instead +pick declarative programming as an even better starting paradigm. + +If the next paradigm shift is towards functional programming, will the following +shift be towards declarative programming? + +== Conclusion + +:simple-made-easy: https://www.infoq.com/presentations/Simple-Made-Easy/ + +Beyond all Richard said, I also hear often bring up functional programming when +talking about utilizing all cores of a computer, and how FP can help with that. + +Rich Hickey makes a great case for single-process FP on his famous talk +"{simple-made-easy}[Simple Made Easy]". + +//// +I find this conclusion too short, and it doesn't revisits the main points +presented on the body of the article. I won't rewrite it now, but it would be +an improvement to extend it to do so. +//// diff --git a/src/content/en/blog/2020/11/12/database-parsers-trees.adoc b/src/content/en/blog/2020/11/12/database-parsers-trees.adoc new file mode 100644 index 0000000..47595e8 --- /dev/null +++ b/src/content/en/blog/2020/11/12/database-parsers-trees.adoc @@ -0,0 +1,226 @@ += Durable persistent trees and parser combinators - building a database +:categories: mediator +:updatedat: 2021-02-09 + +:empty: +:db-article: link:../../08/31/database-i-wish-i-had.html + +I've received with certain frequency messages from people wanting to know if +I've made any progress on the database project {db-article}[I've written about]. + +There are a few areas where I've made progress, and here's a public post on it. + +== Proof-of-concept: DAG log + +:mediator-permalink: https://euandre.org/git/mediator/tree/src/core/clojure/src/mediator.clj?id=db4a727bc24b54b50158827b34502de21dbf8948#n1 + +The main thing I wanted to validate with a concrete implementation was the +concept of modeling a DAG on a sequence of datoms. + +The notion of a _datom_ is a rip-off from Datomic, which models data with time +aware _facts_, which come from RDF. RDF's fact is a triple of +subject-predicate-object, and Datomic's datoms add a time component to it: +subject-predicate-object-time, A.K.A. 
entity-attribute-value-transaction: + +[source,clojure] +---- +[[person :likes "pizza" 0 true] + [person :likes "bread" 1 true] + [person :likes "pizza" 1 false]] +---- + +The above datoms say: - at time 0, `person` like pizza; - at time 1, `person` +stopped liking pizza, and started to like bread. + +Datomic ensures total consistency of this ever growing log by having a single +writer, the transactor, that will enforce it when writing. + +In order to support disconnected clients, I needed a way to allow multiple +writers, and I chose to do it by making the log not a list, but a directed +acyclic graph (DAG): + +[source,clojure] +---- +[[person :likes "pizza" 0 true] + [0 :parent :db/root 0 true] + [person :likes "bread" 1 true] + [person :likes "pizza" 1 false] + [1 :parent 0 1 true]] +---- + +The extra datoms above add more information to build the directionality to the +log, and instead of a single consistent log, the DAG could have multiple leaves +that coexist, much like how different Git branches can have different "latest" +commits. + +In order to validate this idea, I started with a Clojure implementation. The +goal was not to write the actual final code, but to make a proof-of-concept that +would allow me to test and stretch the idea itself. + +This code {mediator-permalink}[already exists], but is yet fairly incomplete: + +:commented-code: https://euandre.org/git/mediator/tree/src/core/clojure/src/mediator.clj?id=db4a727bc24b54b50158827b34502de21dbf8948#n295 +:more: https://euandre.org/git/mediator/tree/src/core/clojure/src/mediator.clj?id=db4a727bc24b54b50158827b34502de21dbf8948#n130 +:than: https://euandre.org/git/mediator/tree/src/core/clojure/src/mediator.clj?id=db4a727bc24b54b50158827b34502de21dbf8948#n146 +:one: https://euandre.org/git/mediator/tree/src/core/clojure/src/mediator.clj?id=db4a727bc24b54b50158827b34502de21dbf8948#n253 + +* the building of the index isn't done yet (with some {commented-code}[commented + code] on the next step to be implemented) +* the indexing is extremely inefficient, with {more}[more] {than}[than] + {one}[one] occurrence of `O²` functions; +* no query support yet. + +== Top-down _and_ bottom-up + +However, as time passed and I started looking at what the final implementation +would look like, I started to consider keeping the PoC around. + +The top-down approach (Clojure PoC) was in fact helping guide me with the +bottom-up, and I now have "promoted" the Clojure PoC into a "reference +implementation". It should now be a finished implementation that says what the +expected behaviour is, and the actual code should match the behaviour. + +The good thing about a reference implementation is that it has no performance of +resources boundary, so if it ends up being 1000× slower and using 500× more +memory, it should be find. The code can be also 10× or 100× simpler, too. + +== Top-down: durable persistent trees + +:pavlo-videos: https://www.youtube.com/playlist?list=PLSE8ODhjZXjbohkNBWQs_otTrBTrjyohi +:db-book: https://www.databass.dev/ + +In promoting the PoC into a reference implementation, this top-down approach now +needs to go beyond doing everything in memory, and the index data structure now +needs to be disk-based. + +Roughly speaking, most storage engines out there are based either on B-Trees or +LSM Trees, or some variations of those. + +But when building an immutable database, update-in-place B-Trees aren't an +option, as it doesn't accommodate keeping historical views of the tree. 
LSM +Trees may seem a better alternative, but duplication on the files with +compaction are also ways to delete old data which is indeed useful for a +historical view. + +I think the thing I'm after is a mix of a Copy-on-Write B-Tree, which would keep +historical versions with the write IO cost amortization of memtables of LSM +Trees. I don't know of any B-Tree variant out there that resembles this, so +I'll call it "Flushing Copy-on-Write B-Tree". + +I haven't written any code for this yet, so all I have is a high-level view of +what it will look like: + +. like Copy-on-Write B-Trees, changing a leaf involves creating a new leaf and + building a new path from root to the leaf. The upside is that writes a lock + free, and no coordination is needed between readers and writers, ever; +. the downside is that a single leaf update means at least `H` new nodes that + will have to be flushed to disk, where `H` is the height of the tree. To + avoid that, the writer creates these nodes exclusively on the in-memory + memtable, to avoid flushing to disk on every leaf update; +. a background job will consolidate the memtable data every time it hits X MB, + and persist it to disk, amortizing the cost of the Copy-on-Write B-Tree; +. readers than will have the extra job of getting the latest relevant + disk-resident value and merge it with the memtable data. + +The key difference to existing Copy-on-Write B-Trees is that the new trees are +only periodically written to disk, and the intermediate values are kept in +memory. Since no node is ever updated, the page utilization is maximum as it +doesn't need to keep space for future inserts and updates. + +And the key difference to existing LSM Trees is that no compaction is run: +intermediate values are still relevant as the database grows. So this leaves +out tombstones and value duplication done for write performance. + +One can delete intermediate index values to reclaim space, but no data is lost +on the process, only old B-Tree values. And if the database ever comes back to +that point (like when doing a historical query), the B-Tree will have to be +rebuilt from a previous value. After all, the database _is_ a set of datoms, +and everything else is just derived data. + +Right now I'm still reading about other data structures that storage engines +use, and I'll start implementing the "Flushing Copy-on-Write B-Tree" as I learn +more{empty}footnote:learn-more-db[ + If you are interested in learning more about this too, the very best two + resources on this subject are Andy Pavlo's "{pavlo-videos}[Intro to Database + Systems]" course and Alex Petrov's "{db-book}[Database Internals]" book. +] and mature it more. + +== Bottom-up: parser combinators and FFI + +:cbindgen: https://github.com/eqrion/cbindgen +:cbindgen-next: https://blog.eqrion.net/future-directions-for-cbindgen/ +:syn-crate: https://github.com/dtolnay/syn +:libedn: https://euandre.org/git/libedn/ + +I chose Rust as it has the best WebAssembly tooling support. + +My goal is not to build a Rust database, but a database that happens to be in +Rust. In order to reach client platforms, the primary API is the FFI one. + +I'm not very happy with current tools for exposing Rust code via FFI to the +external world: they either mix C with C++, which I don't want to do, or +provide no access to the intermediate representation of the FFI, which would be +useful for generating binding for any language that speaks FFI. 

I prefer the path that the author of the {cbindgen}[cbindgen] crate
{cbindgen-next}[proposes]: emitting a data representation of the Rust C API
(the author calls it a `ffi.json` file), and then building transformers from
the data representation to the target language. This way you could generate a C
API _and_ the node-ffi bindings for JavaScript automatically from the Rust
code.

So the first thing to be done before moving on is an FFI exporter that doesn't
mix C and C++, and generates said `ffi.json`, and then to build a few
transformers that take this `ffi.json` and generate the language bindings, be
it C, C++, JavaScript, TypeScript, Kotlin, Swift, Dart,
_etc_footnote:ffi-langs[
  Those are, specifically, the languages I'm most interested in. My goal is
  supporting client applications, and those languages are the most relevant for
  doing so: C for GTK, C++ for Qt, JavaScript and TypeScript for Node.js and
  browser, Kotlin for Android and Swing, Swift for iOS, and Dart for Flutter.
].

I think the best way to get there is by taking the existing code for cbindgen,
which uses the {syn-crate}[syn] crate to parse the Rust
code{empty}footnote:rust-syn[
  The fact that syn is an external crate to the Rust compiler points to a big
  warning: procedural macros are not first class in Rust. They are just like
  Babel plugins in JavaScript land, with the extra shortcoming that there is no
  specification for the Rust syntax, unlike JavaScript.
pass:[

] + As flawed as this may be, it seems to be generally acceptable and adopted, + which works against building a solid ecosystem for Rust. +pass:[

] + The alternative that rust-ffi implements relies on internals of the Rust + compiler, which isn't actually worst, just less common and less accepted. +], and adapt it to emit the metadata. + +I've started a fork of cbindgen: +[line-through]#x-bindgen#{empty}footnote:x-bindgen[ + _EDIT_: now archived, the experimentation was fun. I've started to move more + towards C, so this effort became deprecated. +]. Right now it is just a copy of cbindgen verbatim, and I plan to remove all C +and C++ emitting code from it, and add a IR emitting code instead. + +When starting working on x-bindgen, I realized I didn't know what to look for in +a header file, as I haven't written any C code in many years. So as I was +writing {libedn}[libedn], I didn't know how to build a good C API to expose. So +I tried porting the code to C, and right now I'm working on building a _good_ C +API for a JSON parser using parser combinators: +[line-through]#ParsecC#{empty}footnote:parsecc[ + _EDIT_: now also archived. +]. + +After "finishing" ParsecC I'll have a good notion of what a good C API is, and +I'll have a better direction towards how to expose code from libedn to other +languages, and work on x-bindgen then. + +What both libedn and ParsecC are missing right now are proper error reporting, +and property-based testing for libedn. + +== Conclusion + +I've learned a lot already, and I feel the journey I'm on is worth going +through. + +If any of those topics interest you, message me to discuss more or contribute! +Patches welcome! diff --git a/src/content/en/blog/2020/11/14/local-first-review.adoc b/src/content/en/blog/2020/11/14/local-first-review.adoc new file mode 100644 index 0000000..f9dd4b0 --- /dev/null +++ b/src/content/en/blog/2020/11/14/local-first-review.adoc @@ -0,0 +1,305 @@ += Local-First Software: article review +:categories: presentation article-review + +:empty: +:presentation: link:../../../../slides/2020/11/14/local-first.html FIXME +:reviewed-article: https://martin.kleppmann.com/papers/local-first.pdf + +_This article is derived from a {presentation}[presentation] given at a Papers +We Love meetup on the same subject._ + +This is a review of the article "{reviewed-article}[Local-First Software: You +Own Your Data, in spite of the Cloud]", by M. Kleppmann, A. Wiggins, P. Van +Hardenberg and M. F. McGranaghan. + +== Offline-first, local-first + +The "local-first" term they use isn't new, and I have used it myself in the past +to refer to this types of application, where the data lives primarily on the +client, and there are conflict resolution algorithms that reconcile data created +on different instances. + +Sometimes I see confusion with this idea and "client-side", "offline-friendly", +"syncable", etc. I have myself used this terms, also. + +There exists, however, already the "offline-first" term, which conveys almost +all of that meaning. In my view, "local-first" doesn't extend "offline-first" +in any aspect, rather it gives a well-defined meaning to it instead. I could +say that "local-first" is just "offline-first", but with 7 well-defined ideals +instead of community best practices. + +It is a step forward, and given the number of times I've seen the paper shared +around I think there's a chance people will prefer saying "local-first" in +_lieu_ of "offline-first" from now on. 
+ +== Software licenses + +On a footnote of the 7th ideal ("You Retain Ultimate Ownership and Control"), +the authors say: + +____ +In our opinion, maintaining control and ownership of data does not mean that the +software must necessarily be open source. (...) as long as it does not +artificially restrict what users can do with their files. +____ + +They give examples of artificial restrictions, like this artificial restriction +I've come up with: + +[source,sh] +---- +#!/bin/sh + +TODAY=$(date +%s) +LICENSE_EXPIRATION=$(date -d 2020-11-15 +%s) + +if [ $TODAY -ge $LICENSE_EXPIRATION ]; then + echo 'License expired!' + exit 1 +fi + +echo $((2 + 2)) +---- + +Now when using this very useful program: + +[source,sh] +---- +# today +$ ./useful-adder.sh +4 +# tomorrow +$ ./useful-adder.sh +License expired! +---- + +This is obviously an intentional restriction, and it goes against the 5th ideal +("The Long Now"). This software would only be useful as long as the embedded +license expiration allowed. Sure you could change the clock on the computer, +but there are many other ways that this type of intentional restriction is in +conflict with that ideal. + +However, what about unintentional restrictions? What if a software had an equal +or similar restriction, and stopped working after days pass? Or what if the +programmer added a constant to make the development simpler, and this led to +unintentionally restricting the user? + +[source,sh] +---- +# today +$ useful-program +# ...useful output... + +# tomorrow, with more data +$ useful-program +ERROR: Panic! Stack overflow! +---- + +Just as easily as I can come up with ways to intentionally restrict users, I can +do the same for unintentionally restrictions. A program can stop working for a +variety of reasons. + +If it stops working due do, say, data growth, what are the options? Reverting +to an earlier backup, and making it read-only? That isn't really a "Long Now", +but rather a "Long Now as long as the software keeps working as expected". + +The point is: if the software isn't free, "The Long Now" isn't achievable +without a lot of wishful thinking. Maybe the authors were trying to be more +friendly towards business who don't like free software, but in doing so they've +proposed a contradiction by reconciling "The Long Now" with proprietary +software. + +It isn't the same as saying that any free software achieves that ideal, either. +The license can still be free, but the source code can become unavailable due to +cloud rot. Or maybe the build is undocumented, or the build tools had specific +configuration that one has to guess. A piece of free software can still fail to +achieve "The Long Now". Being free doesn't guarantee it, just makes it +possible. + +A colleague has challenged my view, arguing that the software doesn't really +need to be free, as long as there is an specification of the file format. This +way if the software stops working, the format can still be processed by other +programs. But this doesn't apply in practice: if you have a document that you +write to, and software stops working, you still want to write to the document. +An external tool that navigates the content and shows it to you won't allow you +to keep writing, and when it does that tool is now starting to re-implement the +software. + +An open specification could serve as a blueprint to other implementations, +making the data format more friendly to reverse-engineering. 
But the re-implementation still has to exist, at which point the original
+software failed to achieve "The Long Now".
+
+It is less bad, but still not quite there yet.
+
+== Denial of existing solutions
+
+:distgit: https://drewdevault.com/2018/07/23/Git-is-already-distributed.html
+
+When describing "Existing Data Storage and Sharing Models", in a
+footnote{empty}footnote:devil[
+  This is the second aspect of the article that I'm picking on from a footnote.
+  I guess the devil really is in the details.
+] the authors say:
+
+____
+In principle it is possible to collaborate without a repository service, e.g.
+by sending patch files by email, but the majority of Git users rely on GitHub.
+____
+
+The authors go to great lengths to talk about the usability of cloud apps, and
+even point to research they've done on it, but they've missed learning more
+from local-first solutions that already exist.
+
+Say the automerge CRDT proves to be even more useful than what everybody
+imagined.  Say someone builds a local-first repository service using it.  How
+would it change anything about the Git/GitHub model?  What is different about
+it that prevents people in the future from writing a paper saying:
+
+____
+In principle it is possible to collaborate without a repository service, e.g.
+by using automerge and platform X, but the majority of Git users rely on
+GitHub.
+____
+
+How is this any better?
+
+If it is already {distgit}[possible] to have a local-first development
+workflow, why don't people use it?  Is it just fashion, or is there a
+fundamental problem with it?  If so, what is it, and how can it be avoided?
+
+If sending patches by email is perfectly possible but out of fashion, why even
+talk about Git/GitHub?  Isn't this a problem that people are putting
+themselves in?  How can CRDTs possibly prevent people from doing that?
+
+My impression is that the authors envision a better future, where development
+is fully decentralized unlike today, and somehow CRDTs will make that happen.
+If more people think this way, "CRDT" is next in line in the buzzword list that
+solves everything, like "containers", "blockchain" or "machine learning".
+
+Rather than picturing an imaginary service that could be described as
+"GitHub+CRDTs" and that people would adopt, I'd rather understand why people
+don't work like that already, since Git is built for it.
+
+== Ditching of web applications
+
+:pouchdb: https://pouchdb.com/
+:instant-apps: https://developer.android.com/topic/google-play-instant
+
+The authors put web applications in a worse position for building local-first
+applications, claiming that:
+
+____
+(...) the architecture of web apps remains fundamentally server-centric.
+Offline support is an afterthought in most web apps, and the result is
+accordingly fragile.
+____
+
+Well, I disagree.
+
+The problem isn't inherent to the web platform; it is in how people use it.
+
+I have myself built offline-first applications, leveraging IndexedDB, App
+Cache, _etc_.  I wanted to build an offline-first application on the web, and
+so I did.
+
+In fact, many people choose {pouchdb}[PouchDB] _because_ of that, since it is
+a good tool for offline-first web applications.  The problem isn't really the
+technology, but how much people want their application to be local-first.
+
+Contrast it with Android {instant-apps}[Instant Apps], where applications are
+sent to the phone in small parts.
Since this requires an internet connection to +move from a part of the app bundle to another, a subset of the app isn't +local-first, despite being an app. + +The point isn't the technology, but how people are using it. Local-first web +applications are perfectly possible, just like non-local-first native +applications are possible. + +== Costs are underrated + +I think the costs of "old-fashioned apps" over "cloud apps" are underrated, +mainly regarding storage, and that this costs can vary a lot by application. + +Say a person writes online articles for their personal website, and puts +everything into Git. Since there isn't supposed to be any collaboration, all of +the relevant ideals of local-first are achieved. + +Now another person creates videos instead of articles. They could try keeping +everything local, but after some time the storage usage fills the entire disk. +This person's local-first setup would be much more complex, and would cost much +more on maintenance, backup and storage. + +Even though both have similar needs, a local-first video repository is much more +demanding. So the local-first thinking here isn't "just keep everything local", +but "how much time and money am I willing to spend to keep everything local". + +The convenience of "cloud apps" becomes so attractive that many don't even have +a local copy of their videos, and rely exclusively on service providers to +maintain, backup and store their content. + +The dial measuring "cloud apps" and "old-fashioned apps" needs to be specific to +use-cases. + +== Real-time collaboration is optional + +If I were the one making the list of ideals, I wouldn't focus so much on +real-time collaboration. + +Even though seamless collaboration is desired, it being real-time depends on the +network being available for that. But ideal 3 states that "The Network is +Optional", so real-time collaboration is also optional. + +The fundamentals of a local-first system should enable real-time collaboration +when network is available, but shouldn't focus on it. + +On many places when discussing applications being offline, it is common for me +to find people saying that their application works "even on a plane, subway or +elevator". That is a reflection of when said developers have to deal with +networks being unavailable. + +But this leaves out a big chunk of the world where internet connection is +intermittent, or only works every other day or only once a week, or stops +working when it rains, _etc_. For this audience, living without network +connectivity isn't such a discrete moment in time, but part of every day life. +I like the fact that the authors acknowledge that. + +When discussing "working offline", I'd rather keep this type of person in mind, +then the subset of people who are offline when on the elevator will naturally be +included. + +== On CRDTs and developer experience + +:archived-article: https://web.archive.org/web/20130116163535/https://labs.oracle.com/techrep/1994/smli_tr-94-29.pdf + +When discussing developer experience, the authors bring up some questions to be +answered further, like: + +____ +For an app developer, how does the use of a CRDT-based data layer compare to +existing storage layers like a SQL database, a filesystem, or CoreData? Is a +distributed system harder to write software for? +____ + +That is an easy one: yes. + +A distributed system _is_ harder to write software for, being a distributed +system. 
+
+Naturally, adding a large layer of data structures and algorithms makes the
+system more complex to write software for.  And trying to make this layer
+transparent to the programmer, so that they can pretend it doesn't exist, is a
+bad idea: RPC frameworks have tried that, and failed.
+
+See "{archived-article}[A Note on Distributed Computing]" for a critique of RPC
+frameworks trying to make the network invisible, which I think applies equally
+to making the CRDT layer invisible.
+
+== Conclusion
+
+I liked the article a lot, as it took the "offline-first" philosophy and ran
+with it.
+
+But I think the authors' view that adding CRDTs makes things local-first is a
+bit too magical.
+
+This is an area I have great interest in, and I wish to see more being done in
+the "local-first" space.
diff --git a/src/content/en/blog/2021/01/26/remembering-ann.adoc b/src/content/en/blog/2021/01/26/remembering-ann.adoc
new file mode 100644
index 0000000..6786b3c
--- /dev/null
+++ b/src/content/en/blog/2021/01/26/remembering-ann.adoc
@@ -0,0 +1,216 @@
+= ANN: remembering - Add memory to dmenu, fzf and similar tools
+:categories: ann
+
+:remembering: https://euandreh.xyz/remembering/
+:dmenu: https://tools.suckless.org/dmenu/
+:fzf: https://github.com/junegunn/fzf
+
+Today I pushed v0.1.0 of {remembering}[remembering], a tool to enhance the
+interactive usability of menu-like tools, such as {dmenu}[dmenu] and
+{fzf}[fzf].
+
+== Previous solution
+
+:yeganesh: https://dmwit.com/yeganesh/
+
+I previously used {yeganesh}[yeganesh] to fill this gap, but as I started to
+rely less on Emacs, I adopted fzf as my go-to tool for fuzzy searching in the
+terminal.  But I didn't like that fzf always showed things in the same order,
+when I would only need 3 or 4 commonly used files.
+
+For those who don't know: yeganesh is a wrapper around dmenu that remembers
+your most used programs and puts them at the beginning of the list of
+executables.  This is very convenient for prolonged interactive use, as with
+time the things you usually want are right at the very beginning.
+
+But now I had this thing, yeganesh, that solved this problem for dmenu, but
+didn't for fzf.
+
+I initially considered patching yeganesh to support it, but I found it more
+coupled to dmenu than I would desire.  I'd rather have something that knows
+nothing about dmenu, fzf or anything else, but enhances tools like those in a
+useful way.
+
+== Implementation
+
+:v-010: https://euandre.org/git/remembering/tree/remembering?id=v0.1.0
+:getopts: https://www.opengroup.org/onlinepubs/9699919799/utilities/getopts.html
+:sort: https://www.opengroup.org/onlinepubs/9699919799/utilities/sort.html
+:awk: https://www.opengroup.org/onlinepubs/9699919799/utilities/awk.html
+:spencer-quote: https://en.wikipedia.org/wiki/Henry_Spencer#cite_note-3
+
+Other than being decoupled from dmenu, another improvement I thought could be
+made on top of yeganesh is the choice of programming language.  Instead of
+Haskell, I went with POSIX sh.  Sticking to POSIX sh means fewer build-time
+dependencies (there aren't any, actually), which makes packaging much easier.
+
+The good thing is that the program itself is small enough ({v-010}[119 lines]
+as of v0.1.0) that POSIX sh does the job just fine, combined with other POSIX
+utilities such as {getopts}[getopts], {sort}[sort] and {awk}[awk].
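+
+As a rough illustration of the kind of plumbing those utilities enable (and not
+the actual implementation of `remembering`, whose behaviour is described next),
+a frequency-based reordering can be sketched with `awk` and `sort` alone.  The
+`history_file` argument is a hypothetical file with one previous choice per
+line:
+
+[source,sh]
+----
+# Sketch only: reorder STDIN so that lines that appear most often in a
+# history file of previous picks come first.
+reorder_by_frequency() {
+	history_file="$1"
+	# First pass reads the history file and counts occurrences; second pass
+	# (STDIN, the "-" operand) prefixes each line with an inverted,
+	# zero-padded count, so that a plain ascending sort puts the most
+	# frequent lines first.  Ties fall back to lexicographic order.
+	awk '
+		NR == FNR { count[$0]++; next }
+		{ printf "%09d\t%s\n", 999999999 - count[$0], $0 }
+	' "$history_file" - | sort | cut -f2-
+}
+
+# hypothetical usage:
+#   seq 5 | reorder_by_frequency ~/.cache/remembering/seq-fzf | fzf
+----
+
+The real program also has to record each new pick and handle profiles, but the
+sketch shows why plain POSIX text tools are enough for the core of it.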
+
+The behaviour is: given a program that reads from STDIN and writes a single
+entry to STDOUT, `remembering` wraps that program, and rearranges STDIN so
+that previous choices appear at the beginning.
+
+Where you would do:
+
+[source,sh]
+----
+$ seq 5 | fzf
+
+  5
+  4
+  3
+  2
+> 1
+  5/5
+>
+----
+
+And every time get the same order of numbers, you can now write:
+
+[source,sh]
+----
+$ seq 5 | remembering -p seq-fzf -c fzf
+
+  5
+  4
+  3
+  2
+> 1
+  5/5
+>
+----
+
+On the first run, everything is the same.  If you picked 4 on the previous
+example, the following run would be different:
+
+[source,sh]
+----
+$ seq 5 | remembering -p seq-fzf -c fzf
+
+  5
+  3
+  2
+  1
+> 4
+  5/5
+>
+----
+
+As time passes, the list adjusts itself based on the frequency of your choices.
+
+I aimed for reusability, so that I could wrap diverse commands with
+`remembering` and it would still work.  To accomplish that, a "profile" (the
+`-p something` part) stores data about different runs separately.
+
+I took the idea of building something small with few dependencies to other
+places too:
+
+* the manpages are written in troff directly;
+* the tests are just more POSIX sh files;
+* and a POSIX Makefile runs `check` and `install`.
+
+I was aware of the value of coding to standards, but I had past experience
+mostly with programming language standards, such as ECMAScript, Common Lisp
+and Scheme, or with the IndexedDB and DOM APIs.  It felt good to rediscover
+these nice POSIX tools, which reminds me of a quote by
+{spencer-quote}[Henry Spencer]:
+
+____
+Those who do not understand Unix are condemned to reinvent it, poorly.
+____
+
+== Usage examples
+
+Here are some functions I wrote myself that you may find useful:
+
+=== Run a command with fzf on `$PWD`
+
+[source,sh]
+----
+f() {
+	profile="f-shell-function$(pwd | sed -e 's_/_-_g')"
+	file="$(git ls-files | \
+		remembering -p "$profile" \
+			-c "fzf --select-1 --exit-0 --query \"$2\" --preview 'cat {}'")"
+	if [ -n "$file" ]; then
+		# shellcheck disable=2068
+		history -s f $@
+		history -s "$1" "$file"
+		"$1" "$file"
+	fi
+}
+----
+
+This way I can run `f vi` or `f vi config` at the root of a repository, and the
+list of files will always appear in most-used order.  Adding `pwd` to the
+profile keeps data from different repositories from mixing.
+
+=== Copy password to clipboard
+
+:pass: https://www.passwordstore.org/
+
+[source,sh]
+----
+choice="$(find "$HOME/.password-store" -type f | \
+	grep -Ev '(.git|.gpg-id)' | \
+	sed -e "s|$HOME/.password-store/||" -e 's/\.gpg$//' | \
+	remembering -p password-store \
+		-c 'dmenu -l 20 -i')"
+
+if [ -n "$choice" ]; then
+	pass show "$choice" -c
+fi
+----
+
+Adding the above to a file and binding it to a keyboard shortcut, I can access
+the contents of my {pass}[password store], with the entries ordered by usage.
+
+=== Replacing yeganesh
+
+Where I previously had:
+
+[source,sh]
+----
+exe=$(yeganesh -x) && exec $exe
+----
+
+now I have:
+
+[source,sh]
+----
+exe=$(dmenu_path | remembering -p dmenu-exec -c dmenu) && exec $exe
+----
+
+This way, the executables appear in order of usage.
+
+If you don't have `dmenu_path`, you can get just the underlying `stest` tool
+that looks at the executables available in your `$PATH`.
Here's a juicy +one-liner to do it: + +[source,sh] +---- +$ wget -O- https://dl.suckless.org/tools/dmenu-5.0.tar.gz | \ + tar Ozxf - dmenu-5.0/arg.h dmenu-5.0/stest.c | \ + sed 's|^#include "arg.h"$|// #include "arg.h"|' | \ + cc -xc - -o stest +---- + +With the `stest` utility you'll be able to list executables in your `$PATH` and +pipe them to dmenu or something else yourself: + +[source,sh] +---- +$ (IFS=:; ./stest -flx $PATH;) | sort -u | remembering -p another-dmenu-exec -c dmenu | sh +---- + +In fact, the code for `dmenu_path` is almost just like that. + +== Conclusion + +:packaged: https://euandre.org/git/package-repository/ + +For my personal use, I've {packaged}[packaged] `remembering` for GNU Guix and +Nix. Packaging it to any other distribution should be trivial, or just +downloading the tarball and running `[sudo] make install`. + +Patches welcome! diff --git a/src/content/en/blog/2021/02/17/fallible.adoc b/src/content/en/blog/2021/02/17/fallible.adoc new file mode 100644 index 0000000..1f2f641 --- /dev/null +++ b/src/content/en/blog/2021/02/17/fallible.adoc @@ -0,0 +1,285 @@ += ANN: fallible - Fault injection library for stress-testing failure scenarios +:updatedat: 2022-03-06 + +:fallible: https://euandreh.xyz/fallible/ + +Yesterday I pushed v0.1.0 of {fallible}[fallible], a miniscule library for +fault-injection and stress-testing C programs. + +== _EDIT_ + +:changelog: https://euandreh.xyz/fallible/CHANGELOG.html +:tarball: https://euandre.org/static/attachments/fallible.tar.gz + +2021-06-12: As of {changelog}[0.3.0] (and beyond), the macro interface improved +and is a bit different from what is presented in this article. If you're +interested, I encourage you to take a look at it. + +2022-03-06: I've {tarball}[archived] the project for now. It still needs some +maturing before being usable. + +== Existing solutions + +:gnu-std: https://www.gnu.org/prep/standards/standards.html#Semantics +:valgrind: https://www.valgrind.org/ +:so-alloc: https://stackoverflow.com/questions/1711170/unit-testing-for-failed-malloc + +Writing robust code can be challenging, and tools like static analyzers, fuzzers +and friends can help you get there with more certainty. As I would try to +improve some of my C code and make it more robust, in order to handle system +crashes, filled disks, out-of-memory and similar scenarios, I didn't find +existing tooling to help me get there as I expected to find. I couldn't find +existing tools to help me explicitly stress-test those failure scenarios. + +Take the "{gnu-std}[Writing Robust Programs]" section of the GNU Coding +Standards: + +____ +Check every system call for an error return, unless you know you wish to ignore +errors. (...) Check every call to malloc or realloc to see if it returned NULL. +____ + +From a robustness standpoint, this is a reasonable stance: if you want to have a +robust program that knows how to fail when you're out of memory and `malloc` +returns `NULL`, than you ought to check every call to `malloc`. + +Take a sample code snippet for clarity: + +[source,c] +---- +void a_function() { + char *s1 = malloc(A_NUMBER); + strcpy(s1, "some string"); + + char *s2 = malloc(A_NUMBER); + strcpy(s2, "another string"); +} +---- + +At a first glance, this code is unsafe: if any of the calls to `malloc` returns +`NULL`, `strcpy` will be given a `NULL` pointer. 
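+
+Before reaching for any tooling, a crude way to even watch this failure path is
+to cap the process' virtual memory so that `malloc` itself starts failing.
+This is only a sketch: the limit value is arbitrary and system-dependent, the
+program name is hypothetical, and too small a limit may keep the program from
+starting at all:
+
+[source,sh]
+----
+# Sketch: make malloc fail for real by capping the address space (KiB value
+# picked by trial and error); strcpy on the resulting NULL then crashes.
+$ ( ulimit -v 10240; ./program-calling-a_function )
+Segmentation fault
+----
+
+That "works", but it is global and imprecise: there is no way to make only the
+second call to `malloc` fail, which is the gap the rest of this article tries
+to fill.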
+
+My first instinct was to change this code to something like this:
+
+[source,diff]
+----
+@@ -1,7 +1,15 @@
+ void a_function() {
+ 	char *s1 = malloc(A_NUMBER);
++	if (!s1) {
++		fprintf(stderr, "out of memory, exiting\n");
++		exit(1);
++	}
+ 	strcpy(s1, "some string");
+ 
+ 	char *s2 = malloc(A_NUMBER);
++	if (!s2) {
++		fprintf(stderr, "out of memory, exiting\n");
++		exit(1);
++	}
+ 	strcpy(s2, "another string");
+ }
+----
+
+As I later found out, there are at least 2 problems with this approach:
+
+. *it doesn't compose*: this could arguably work if `a_function` were `main`.
+  But if `a_function` lives inside a library, an `exit(1);` is an inelegant way
+  of handling failures, and will catch the top-level `main` consuming the
+  library by surprise;
+. *it gives up instead of handling failures*: the actual handling goes a bit
+  beyond stopping.  What about open file handles, in-memory caches, unflushed
+  bytes, etc.?
+
+If you could force only the second call to `malloc` to fail,
+{valgrind}[Valgrind] would correctly complain that the program exited with
+unfreed memory.
+
+So the final change, making the best version of the above code, is:
+
+[source,diff]
+----
+@@ -1,15 +1,15 @@
+-void a_function() {
++bool a_function() {
+ 	char *s1 = malloc(A_NUMBER);
+ 	if (!s1) {
+-		fprintf(stderr, "out of memory, exiting\n");
+-		exit(1);
++		return false;
+ 	}
+ 	strcpy(s1, "some string");
+ 
+ 	char *s2 = malloc(A_NUMBER);
+ 	if (!s2) {
+-		fprintf(stderr, "out of memory, exiting\n");
+-		exit(1);
++		free(s1);
++		return false;
+ 	}
+ 	strcpy(s2, "another string");
++	return true;
+ }
+----
+
+Instead of returning `void`, `a_function` now returns `bool` to indicate
+whether an error occurred during its execution.  If `a_function` returned a
+pointer to something, the return value could be `NULL` instead, or an `int`
+representing an error code.
+
+The code is now a) safe and b) failing gracefully, returning control to the
+caller to properly handle the error case.
+
+After seeing similar patterns in well-designed APIs, I adopted this practice
+for my own code, but was still left with manually verifying its correctness
+and robustness.
+
+How could I add assertions around my code that would help me make sure the
+`free(s1);` exists, before getting an error report?  How do other people and
+projects solve this?
+
+From what I could see, people either a) hope for the best, b) write safe code
+but don't stress-test it, or c) write ad-hoc code to stress it.
+
+The most prominent case of c) is SQLite: it has a few wrappers around the
+familiar `malloc` to do fault injection, check for memory limits, add warnings,
+create shim layers for other environments, etc.  All of that, however, is
+tightly coupled with SQLite itself, and couldn't easily be pulled out for use
+somewhere else.
+
+When searching online, an {so-alloc}[interesting thread] caught my attention:
+fail each call to `malloc` the first time it happens, and when the same
+stacktrace appears again, allow it to proceed.
+
+== Implementation
+
+:mallocfail: https://github.com/ralight/mallocfail
+:should-fail-fn: https://euandre.org/git/fallible/tree/src/fallible.c?id=v0.1.0#n16
+
+A working implementation of that idea already exists: {mallocfail}[mallocfail].
+It uses `LD_PRELOAD` to replace `malloc` at run-time, computes the SHA of the
+stacktrace and fails once for each SHA.
+
+I initially envisioned and started implementing something very similar to
+mallocfail.
However, I wanted it to go beyond out-of-memory scenarios, and using
+`LD_PRELOAD` for every possible corner that could fail wasn't a good idea in
+the long run.
+
+Also, mallocfail won't work together with tools such as Valgrind, which wants
+to do its own override of `malloc` with `LD_PRELOAD`.
+
+I instead went with something less automatic: starting with a
+`fallible_should_fail(char *filename, int lineno)` function that fails once for
+each `filename`+`lineno` combination, I created macro wrappers around common
+functions such as `malloc`:
+
+[source,c]
+----
+void *fallible_malloc(size_t size, const char *const filename, int lineno) {
+#ifdef FALLIBLE
+	if (fallible_should_fail(filename, lineno)) {
+		return NULL;
+	}
+#else
+	(void)filename;
+	(void)lineno;
+#endif
+	return malloc(size);
+}
+
+#define MALLOC(size) fallible_malloc(size, __FILE__, __LINE__)
+----
+
+With this definition, I could replace the calls to `malloc` with `MALLOC` (or
+any other name that you want to `#define`):
+
+[source,diff]
+----
+--- 3.c	2021-02-17 00:15:38.019706074 -0300
++++ 4.c	2021-02-17 00:44:32.306885590 -0300
+@@ -1,11 +1,11 @@
+ bool a_function() {
+-	char *s1 = malloc(A_NUMBER);
++	char *s1 = MALLOC(A_NUMBER);
+ 	if (!s1) {
+ 		return false;
+ 	}
+ 	strcpy(s1, "some string");
+ 
+-	char *s2 = malloc(A_NUMBER);
++	char *s2 = MALLOC(A_NUMBER);
+ 	if (!s2) {
+ 		free(s1);
+ 		return false;
+----
+
+With this change, if the program gets compiled with the `-DFALLIBLE` flag, the
+fault-injection mechanism will run, and `MALLOC` will fail once for each
+`filename`+`lineno` combination.  When the flag is missing, `MALLOC` is a very
+thin wrapper around `malloc`, which compilers can remove entirely, and the
+`-lfallible` flags can be omitted.
+
+This applies not only to `malloc` or other `stdlib.h` functions.  If
+`a_function` is important or relevant, I could add a wrapper around it too,
+one that calls `fallible_should_fail` to exercise whether its callers are also
+doing the proper clean-up.
+
+The actual code is just this single function,
+{should-fail-fn}[`fallible_should_fail`], which ended up taking only ~40 lines.
+In fact, there are more lines of either Makefile (111), README.md (82) or troff
+(306) in this first version.
+
+The price for such fine-grained control is that this approach requires more
+manual work.
+
+== Usage examples
+
+=== `MALLOC` from the `README.md`
+
+:fallible-check: https://euandreh.xyz/fallible/fallible-check.1.html
+
+[source,c]
+----
+// leaky.c
+#include <stdlib.h>
+#include <string.h>
+#include <fallible.h> /* assumption: the name of the header that provides MALLOC */
+
+int main() {
+	char *aaa = MALLOC(100);
+	if (!aaa) {
+		return 1;
+	}
+	strcpy(aaa, "a safe use of strcpy");
+
+	char *bbb = MALLOC(100);
+	if (!bbb) {
+		// free(aaa);
+		return 1;
+	}
+	strcpy(bbb, "not unsafe, but aaa is leaking");
+
+	free(bbb);
+	free(aaa);
+	return 0;
+}
+----
+
+Compile with `-DFALLIBLE` and run {fallible-check}[`fallible-check.1`]:
+
+[source,sh]
+----
+$ c99 -DFALLIBLE -o leaky leaky.c -lfallible
+$ fallible-check ./leaky
+Valgrind failed when we did not expect it to:
+(...suppressed output...)
+# exit status is 1
+----
+
+== Conclusion
+
+:package: https://euandre.org/git/package-repository/
+
+For my personal use, I'll {package}[package] it for GNU Guix and Nix.
+Packaging it for any other distribution should be trivial, or one can just
+download the tarball and run `[sudo] make install`.
+
+Patches welcome!
diff --git a/src/content/en/blog/2021/02/17/fallible.tar.gz b/src/content/en/blog/2021/02/17/fallible.tar.gz
new file mode 100644
index 0000000..211cadd
Binary files /dev/null and b/src/content/en/blog/2021/02/17/fallible.tar.gz differ
diff --git a/src/content/en/blog/2021/04/29/relational-review.adoc b/src/content/en/blog/2021/04/29/relational-review.adoc
new file mode 100644
index 0000000..4b53737
--- /dev/null
+++ b/src/content/en/blog/2021/04/29/relational-review.adoc
@@ -0,0 +1,144 @@
+= A Relational Model of Data for Large Shared Data Banks - article-review
+
+:empty:
+:reviewed-article: https://www.seas.upenn.edu/~zives/03f/cis550/codd.pdf
+
+This is a review of the article "{reviewed-article}[A Relational Model of Data
+for Large Shared Data Banks]", by E. F. Codd.
+
+== Data Independence
+
+Codd brings up the idea of _data independence_ as a better approach to use in
+databases.  This is in contrast with the existing approaches, namely
+hierarchical (tree-based) and network-based.
+
+His main argument is that queries in applications shouldn't depend on, nor be
+coupled to, how the data is represented internally by the database system.
+This key idea is very powerful, and something that we strive for in many other
+places: decoupling the interface from the implementation.
+
+If the database system has this separation, it can keep the querying interface
+stable, while having the freedom to change its internal representation at will,
+for better performance, less storage, etc.
+
+This is true for most modern database systems.  They can change from B-Trees
+with leaves containing pointers to data, to B-Trees with leaves containing the
+raw data, to hash tables.  All that without changing the query interface, only
+its performance.
+
+Codd mentions that, from an information representation standpoint, any index is
+a duplication, but a useful one for performance.
+
+This data independence also impacts ordering (a _relation_ doesn't rely on the
+insertion order).
+
+== Duplicates
+
+His definition of relational data is a bit different from most modern database
+systems, namely: *no duplicate rows*.
+
+I couldn't find a reason behind this restriction, though.  For practical
+purposes, I find it useful to have it.
+
+== Relational Data
+
+:edn: https://github.com/edn-format/edn
+
+In the article, Codd doesn't try to define a language, and today's most popular
+one is SQL.
+
+However, there is no rule that says that "SQL database" and "relational
+database" are synonyms.  One could have a relational database without using SQL
+at all, and it would still be a relational one.
+
+The main one that I have in mind, and the reason that led me to reading this
+paper in the first place, is Datomic.
+
+It uses an {edn}[edn]-based representation for datalog
+queries{empty}footnote:edn-queries[
+  You can think of it as JSON, but with a Clojure taste.
+], and a particular schema used to represent data.
+
+Even though it looks very weird when coming from SQL, I'd argue that it ticks
+all the boxes (except for "no duplicates") that define a relational database,
+since building relations and applying operations on them is possible.
+
+Compare and contrast a contrived example of possible SQL and datalog
+representations of the same data:
+
+[source,sql]
+----
+-- create schema
+CREATE TABLE people (
+	id         UUID PRIMARY KEY,
+	name       TEXT NOT NULL,
+	manager_id UUID,
+	FOREIGN KEY (manager_id) REFERENCES people (id)
+);
+
+-- insert data
+INSERT INTO people (id, name, manager_id) VALUES
+	('d3f29960-ccf0-44e4-be66-1a1544677441', 'Foo', '076356f4-1a0e-451c-b9c6-a6f56feec941'),
+	('076356f4-1a0e-451c-b9c6-a6f56feec941', 'Bar', NULL);
+
+-- query data, make a relation
+SELECT employees.name AS "employee-name",
+       managers.name  AS "manager-name"
+FROM people employees
+INNER JOIN people managers ON employees.manager_id = managers.id;
+----
+
+[source,clojure]
+----
+;; create schema
+#{{:db/ident       :person/id
+   :db/valueType   :db.type/uuid
+   :db/cardinality :db.cardinality/one
+   :db/unique      :db.unique/value}
+  {:db/ident       :person/name
+   :db/valueType   :db.type/string
+   :db/cardinality :db.cardinality/one}
+  {:db/ident       :person/manager
+   :db/valueType   :db.type/ref
+   :db/cardinality :db.cardinality/one}}
+
+;; insert data
+#{{:person/id      #uuid "d3f29960-ccf0-44e4-be66-1a1544677441"
+   :person/name    "Foo"
+   :person/manager [:person/id #uuid "076356f4-1a0e-451c-b9c6-a6f56feec941"]}
+  {:person/id      #uuid "076356f4-1a0e-451c-b9c6-a6f56feec941"
+   :person/name    "Bar"}}
+
+;; query data, make a relation
+{:find [?employee-name ?manager-name]
+ :where [[?person  :person/name    ?employee-name]
+         [?person  :person/manager ?manager]
+         [?manager :person/name    ?manager-name]]}
+----
+
+(Forgive any errors in the above SQL and datalog code, I didn't run them to
+check.  Patches welcome!)
+
+This employee example comes from the paper, and both the SQL and the datalog
+representations match the paper's definition of "relational".
+
+Both "Foo" and "Bar" are employees, and the data is normalized.  SQL represents
+data as tables, and Datomic as datoms, but relations can be derived from both,
+which we could view as:
+
+[source,sql]
+----
+employee_name | manager_name
+----------------------------
+"Foo"         | "Bar"
+----
+
+== Conclusion
+
+The article also talks about operators, consistency and normalization, which
+are now so widespread and well-known that it feels a bit weird seeing someone
+advocate for them.
+
+I also establish that `relational != SQL`: other databases such as Datomic are
+also relational, following Codd's original definition.
diff --git a/src/content/en/blog/categories.adoc b/src/content/en/blog/categories.adoc
new file mode 100644
index 0000000..f29acda
--- /dev/null
+++ b/src/content/en/blog/categories.adoc
@@ -0,0 +1 @@
+= Articles by category
diff --git a/src/content/en/blog/index.adoc b/src/content/en/blog/index.adoc
new file mode 100644
index 0000000..afd64d4
--- /dev/null
+++ b/src/content/en/blog/index.adoc
@@ -0,0 +1 @@
+= Blog
-- 
cgit v1.2.3