Cost of using unbuffered I/O

2023-03-17

It's easy to think that compiled languages like Rust or C++ give us high performance almost for free. But it's just as easy to lose all of that speed for trivial reasons.

One such example is using convenience functions for I/O. As an example, let's parse a JSON file with serde_json:

// Imports are omitted in the original post; these are the assumed ones.
use std::fs::File;
use std::io::BufReader;

use anyhow::Result; // assuming anyhow's Result alias for the `?` conversions
use serde_json::Value;
use structopt::StructOpt;

// `Opt` (definition omitted) is assumed to be a structopt-derived struct
// with `path`, `buffer` and `buffer_size` fields.
fn main() -> Result<()> {
    let opt = Opt::from_args();
    let file = File::open(opt.path)?;
    // Parse either straight from the File or through a BufReader.
    let content: Value = match (opt.buffer, opt.buffer_size) {
        (false, _) => serde_json::from_reader(file)?,
        (true, None) => serde_json::from_reader(BufReader::new(file))?,
        (true, Some(n)) => serde_json::from_reader(BufReader::with_capacity(n, file))?,
    };
    dbg!(content.is_object());
    Ok(())
}

When we run that program under hyperfine, we see a significant speedup in the buffered case:

$ hyperfine './target/release/unbuffered ./test.json' './target/release/unbuffered ./test.json --buffer'
Benchmark 1: ./target/release/unbuffered ./test.json
  Time (mean ± σ):     19.737 s ±  0.131 s    [User: 6.891 s, System: 12.843 s]
  Range (min … max):   19.596 s … 20.040 s    10 runs
 
Benchmark 2: ./target/release/unbuffered ./test.json --buffer
  Time (mean ± σ):     550.4 ms ±   5.5 ms    [User: 494.0 ms, System: 56.2 ms]
  Range (min … max):   538.8 ms … 558.2 ms    10 runs
 
Summary
  './target/release/unbuffered ./test.json --buffer' ran
   35.86 ± 0.43 times faster than './target/release/unbuffered ./test.json'

Buffering is dramatically faster, and the unbuffered run spends most of its time in the kernel (12.8 s of system time versus 56 ms), which points at syscall overhead. Let's see how many read syscalls each version emits:

$ strace -e read ./target/release/unbuffered test.json 2>&1 | wc -l
26141351
$ strace -e read ./target/release/unbuffered test.json --buffer 2>&1 | wc -l
3200

The number of read syscalls in the unbuffered case is very close to the size of test.json in bytes (26141343). We can easily check the first few read syscalls:

$ strace -e read ./target/release/unbuffered test.json  2>&1 | head
read(3, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0P\237\2\0\0\0\0\0"..., 832) = 832
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\0\0\0\0\0\0\0\0"..., 832) = 832
read(3, "55c65f639000-55c65f645000 r--p 0"..., 1024) = 1024
read(3, "15000-7f6b53019000 r--p 00214000"..., 1024) = 1024
read(3, "57                   /usr/lib/x8"..., 1024) = 873
read(3, "[", 1)                         = 1
read(3, "{", 1)                         = 1
read(3, "\"", 1)                        = 1
read(3, "i", 1)                         = 1
read(3, "d", 1)                         = 1

We can see that the JSON file starts being read at the sixth read syscall (the first five belong to process startup), and each of those syscalls reads a single byte. This is in stark contrast to the buffered case, where each read syscall fetches 8192 bytes, which is BufReader's default buffer capacity:

$ strace -e read ./target/release/unbuffered test.json --buffer 2>&1 | head
read(3, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0P\237\2\0\0\0\0\0"..., 832) = 832
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\0\0\0\0\0\0\0\0"..., 832) = 832
read(3, "56509c94a000-56509c956000 r--p 0"..., 1024) = 1024
read(3, "15000-7f8dacc19000 r--p 00214000"..., 1024) = 1024
read(3, "57                   /usr/lib/x8"..., 1024) = 873
read(3, "[{\"id\":\"2489651045\",\"type\":\"Crea"..., 8192) = 8192
read(3, ",\"target_commitish\":\"master\",\"na"..., 8192) = 8192
read(3, "l\":\"https://api.github.com/users"..., 8192) = 8192
read(3, "ownloads_url\":\"https://api.githu"..., 8192) = 8192
read(3, "/api.github.com/repos/edx/edx-pl"..., 8192) = 8192

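The same pattern can be confirmed from inside the program, without strace, by counting how often serde_json actually calls read on the underlying File. The wrapper below is a hypothetical helper (not part of the original program): it increments a counter on every read call, and for a raw File each of those calls is exactly one read(2) syscall.

use std::fs::File;
use std::io::{BufReader, Read};

// Hypothetical helper: counts calls to `read` on the wrapped reader.
// On a raw `File`, every such call is one read(2) syscall.
struct CountingReader<R> {
    inner: R,
    calls: u64,
}

impl<R: Read> Read for CountingReader<R> {
    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
        self.calls += 1;
        self.inner.read(buf)
    }
}

fn main() -> std::io::Result<()> {
    // Unbuffered: serde_json pulls the input one byte per `read` call.
    let mut unbuffered = CountingReader { inner: File::open("test.json")?, calls: 0 };
    let _: serde_json::Value = serde_json::from_reader(&mut unbuffered).unwrap();

    // Buffered: the File only sees one large read per BufReader refill.
    let mut buffered = CountingReader { inner: File::open("test.json")?, calls: 0 };
    let _: serde_json::Value = serde_json::from_reader(BufReader::new(&mut buffered)).unwrap();

    println!("read calls, unbuffered: {}", unbuffered.calls);
    println!("read calls, buffered:   {}", buffered.calls);
    Ok(())
}

Running it against the same test.json should report numbers close to the strace counts above: roughly one call per byte without the BufReader, and one call per 8 KiB refill with it.
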
Would increasing the buffer size improve the speed even more? Not really:

$ hyperfine './target/release/unbuffered --buffer --buffer-size 65536 ./test.json' './target/release/unbuffered ./test.json --buffer'
Benchmark 1: ./target/release/unbuffered --buffer --buffer-size 65536 ./test.json
  Time (mean ± σ):     540.2 ms ±   8.2 ms    [User: 489.8 ms, System: 49.7 ms]
  Range (min … max):   529.9 ms … 557.7 ms    10 runs
 
Benchmark 2: ./target/release/unbuffered ./test.json --buffer
  Time (mean ± σ):     541.2 ms ±   5.5 ms    [User: 481.1 ms, System: 59.2 ms]
  Range (min … max):   531.4 ms … 546.4 ms    10 runs
 
Summary
  './target/release/unbuffered --buffer --buffer-size 65536 ./test.json' ran
    1.00 ± 0.02 times faster than './target/release/unbuffered ./test.json --buffer'
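
That is expected: with 8 KiB reads the syscall count is already tiny, and both buffered runs spend almost all of their time parsing (user time dominates), so a larger buffer has little left to win. A related option, not benchmarked here and outside the original program, is to drop the reader entirely: read the whole file into memory in one go and parse the in-memory bytes with serde_json::from_slice. A minimal sketch, assuming the same anyhow::Result alias as above:

use std::fs;
use serde_json::Value;

fn main() -> anyhow::Result<()> {
    // One bulk read of the whole file (a handful of large read syscalls),
    // then parsing happens entirely in memory.
    let bytes = fs::read("test.json")?;
    let content: Value = serde_json::from_slice(&bytes)?;
    dbg!(content.is_object());
    Ok(())
}

The trade-off is memory: the whole file has to fit in RAM at once, while the BufReader version only ever holds one buffer's worth of data.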