I was introduced to a simple optimization problem by one of my interns last week. He was about to participate in a friendly competition where they were asked to find a performant solution to the problem:
You have a file containing up to 6 million lines where each line has the following
form: ABC123
. Letters used are A-Z
expect I
, Q
and V
. Digits used are 0-9
.
You only need to answer yes or no to whether the file contains duplicates.
The performance measurement shall include the cost of reading file to memory.
You can find test data here.
First, I decided to solve the problem as quick as I could (ie development time, not execution time).
So I built a simple F# program:
let clock =
let sw = System.Diagnostics.Stopwatch ()
sw.Start ()
sw
let lookForDuplicatesReference (fn : string) : unit =
printfn "Looking for duplicates in %A (reference implementation)..." fn
let before = clock.ElapsedMilliseconds
use input = new System.IO.StreamReader(fn)
let set = System.Collections.Generic.HashSet<string> ()
let rec loop d =
let l = input.ReadLine ()
if l <> null then
if set.Add l |> not then
printfn " Duplicate detected: %A" l
loop (d + 1)
else
loop d
else
d
let dupes = loop 0
let now = clock.ElapsedMilliseconds
printfn " %A duplicates found, it took %d ms" dupes (now - before)
On my (decent) machine I got the following performance numbers:
Looking for duplicates in "C:\temp\GoForPerf\Rgn00.txt" (reference implementation)...
Duplicate detected: "UKS855"
1 duplicates found, it took 117 ms
Looking for duplicates in "C:\temp\GoForPerf\Rgn01.txt" (reference implementation)...
Duplicate detected: "GEE848"
1 duplicates found, it took 2044 ms
Looking for duplicates in "C:\temp\GoForPerf\Rgn02.txt" (reference implementation)...
0 duplicates found, it took 2284 ms
Then I brought out my calculator, the number of possible of keys are:
23*23*23*10*10*10 ==> 12167000 ~= 12 million
The file contains up to 6 million keys. That means that set will have a fill rate of approximately 50% since we don't have many duplicates.
Because of this the set used to track if we had any duplicates can be implemented
as bool []
and when reading the key we convert the string key into an integer which
has a one to one to the key. We check the position in the set and if it's true we
seen the key before, otherwise we set it to true. To reduce the cost IO operations
I read the lines into a byte []
let lookForDuplicates (fn : string) : unit =
printfn "Looking for duplicates in %A..." fn
let before = clock.ElapsedMilliseconds
let inline letter m c = m * (int c - int 'A')
let inline digit m c = m * (int c - int '0')
let set : bool [] = Array.zeroCreate (26*26*26*1000)
use input = System.IO.File.OpenRead fn
let bs : byte [] = Array.zeroCreate 8
let rec loop d =
if input.Read (bs, 0, bs.Length) = 8 then
let i =
(letter (26*26*1000) bs.[0])
+ (letter (26*1000) bs.[1])
+ (letter (1000) bs.[2])
+ (digit (100) bs.[3])
+ (digit (10) bs.[4])
+ (digit (1) bs.[5])
if set.[i] then
printfn " Duplicate detected: %c%c%c%c%c%c" (char bs.[0]) (char bs.[1]) (char bs.[2]) (char bs.[3]) (char bs.[4]) (char bs.[5])
loop (d + 1)
else
set.[i] <- true
loop d
else
d
let dupes = loop 0
let now = clock.ElapsedMilliseconds
printfn " %A duplicates found, it took %d ms" dupes (now - before)
This gave the following performance numbers:
Looking for duplicates in "C:\temp\GoForPerf\Rgn00.txt"...
Duplicate detected: UKS855
1 duplicates found, it took 47 ms
Looking for duplicates in "C:\temp\GoForPerf\Rgn01.txt"...
Duplicate detected: GEE848
1 duplicates found, it took 388 ms
Looking for duplicates in "C:\temp\GoForPerf\Rgn02.txt"...
0 duplicates found, it took 390 ms
A roughly 6x performance improvement. Not bad.
At this point I thought it was interesting to measure the cost of IO:
Reading file "C:\temp\GoForPerf\Rgn00.txt"...
500000 lines, it took 9 ms
Reading file "C:\temp\GoForPerf\Rgn01.txt"...
6000000 lines, it took 104 ms
Reading file "C:\temp\GoForPerf\Rgn02.txt"...
6000000 lines, it took 119 ms
Press any key to continue . . .
The IO seems to be 25% of the cost of finding the duplicates, it means there's room to improve the algorithm.
On my machine I have the following cache sizes:
L1: 256 KiB
L2: 1 MiB
L3: 6 MiB
The size of of the set in memory is
26*26*26*10*10*10 ==> 17576000 ~= 17 MiB
If the keys are randomly distributed it means that roughly 60% of the set reads will be reads from RAM (ie slow).
We can easily see this affects performance by changing from bool []
to int []
This set is ~68 MiB in size.
Looking for duplicates in "C:\temp\GoForPerf\Rgn00.txt"...
Duplicate detected: UKS855
1 duplicates found, it took 71 ms
Looking for duplicates in "C:\temp\GoForPerf\Rgn01.txt"...
Duplicate detected: GEE848
1 duplicates found, it took 595 ms
Looking for duplicates in "C:\temp\GoForPerf\Rgn02.txt"...
0 duplicates found, it took 570 ms
Press any key to continue . . .
A performance degradation of 50%.
In order to improve cache usage we should minimize the size of the set in memory. One way to do so is encoding each entry as a bit. We will have to do a bit more bit fiddling but it should improve cache locality and therefore the cost could be worth it.
The size of this set would be ~2 MiB in size which easily fits into L3 and 50% of it fits into L2.
let lookForDuplicatesDenseSet (fn : string) : unit =
printfn "Looking for duplicates in %A..." fn
let before = clock.ElapsedMilliseconds
let inline letter m c = m * (int c - int 'A')
let inline digit m c = m * (int c - int '0')
let set : uint64 [] = Array.zeroCreate (26*26*26*1000 / 64)
use input = System.IO.File.OpenRead fn
let bs : byte [] = Array.zeroCreate 8
let rec loop d =
if input.Read (bs, 0, bs.Length) = 8 then
let i =
(letter (26*26*1000) bs.[0])
+ (letter (26*1000) bs.[1])
+ (letter (1000) bs.[2])
+ (digit (100) bs.[3])
+ (digit (10) bs.[4])
+ (digit (1) bs.[5])
let p = i >>> 6
let b = i &&& 0x3F
let bt = 1UL <<< b
let sv = set.[p]
if (sv &&& bt) <> 0UL then
printfn " Duplicate detected: %c%c%c%c%c%c" (char bs.[0]) (char bs.[1]) (char bs.[2]) (char bs.[3]) (char bs.[4]) (char bs.[5])
loop (d + 1)
else
set.[p] <- sv ||| bt
loop d
else
d
let dupes = loop 0
let now = clock.ElapsedMilliseconds
printfn " %A duplicates found, it took %d ms" dupes (now - before)
Performance numbers:
Looking for duplicates in "C:\temp\GoForPerf\Rgn00.txt"...
Duplicate detected: UKS855
1 duplicates found, it took 43 ms
Looking for duplicates in "C:\temp\GoForPerf\Rgn01.txt"...
Duplicate detected: GEE848
1 duplicates found, it took 236 ms
Looking for duplicates in "C:\temp\GoForPerf\Rgn02.txt"...
0 duplicates found, it took 175 ms
Press any key to continue . . .
Compared to a previous best result this is roughly 100% increase in performance even though the code is more complex.
If we look at Rng02.txt
we see that the IO cost is roughly: 110 ms, the total cost is:
180 ms meaning the CPU cost is: 70 ms. If we deduct the IO cost of the previously best result we see that we improved the CPU cost from 280 ms to 70 ms. From that perspective we improved the performance 4x by having better cache locality. So cache locality is important for performance.
As IO is the main cost right now lets look at reducing the cost of IO in more detail.
I did some experimentation with MemoryMappedFile
in F#
but got disappointing results
although I know they are very fast from my previous experience with them in C++
.
Turns out MemoryMappedFile
exposes low-level handles which can be used to reduce overhead significantly. See "Update 2016-10-09" below
So I decided to create a C++
program. You can find it
here
The performance numbers are:
Looking for duplicates in C:\temp\GoForPerf\Rgn00.txt
Duplicate found: UKS855
it took 9 ms
Looking for duplicates in C:\temp\GoForPerf\Rgn01.txt
Duplicate found: GEE848
it took 49 ms
Looking for duplicates in C:\temp\GoForPerf\Rgn02.txt
it took 46 ms
Press any key to continue . . .
Those numbers look pretty decent. Let's check them against what should be theoretical maximum.
My machine is running at 3.4 GHz
. We process 48000000
characters in 46 ms
.
That gives 3.25
cycles per character. That includes IO. To compare, reading an
entry from L1 cache cost 3
cycles. It probably can be improved but I think it's decent.
I had some hopes the performance could be improved by running on multiple cores especially since I thought I could use the fact that L2 cache is unique per core and by writing the algorithm in such a way that the different cores see only a fraction of the keys they wouldn't need to goto L3 cache.
Looking for duplicates in C:\temp\GoForPerf\Rgn00.txt
Duplicate found: UKS855
it took 14 ms
Looking for duplicates in C:\temp\GoForPerf\Rgn01.txt
Duplicate found: GEE848
it took 91 ms
Looking for duplicates in C:\temp\GoForPerf\Rgn02.txt
it took 71 ms
Press any key to continue . . .
So it looks like the parallel algorithm is roughly 50% slower but it's much much worse.
I have 4 cores on my machine so the parallel algorithm uses 6x more CPU resources to compute the answer as the best non-parallel algorithm!
The final solution is 46x times faster than the first attempt. A pretty decent improvement.
All in all, the intern shared a pretty cool exercise in optimizing a simple problem.
My solutions used rather straight-forward techniques that didn't use additional facts
such that Q
, I
and V
isn't used (this could reduce the size of the set even further)
or that only a yes/no answer is needed. Perhaps someone can come up with something even better.
My venture into parallelism was disappointing but perhaps I did something wrong.
Perhaps SIMD could help improve performance by applying AVX intrinsics to the problem so that one 4 processes keys at once (I suck at AVX so I haven't tried it yet).
A final note is that the OS is able to cache the file which means that IO reads were hot reads. If the reads were cold the IO cost should be significantly higher.
The managed MemoryMappedFile
APIs adds costly overhead but they expose low-level handles which
allows us to write unsafe F#
that can almost match C++
performance. The small difference there is is probably from that C++
intrinsics allows use of x64
instruction bts
that tests and sets a bit where F#
has to load a 64 bit word and apply a computed mask.
let lookForDuplicatesMemoryMappedFile (fn : string) : unit =
printfn "Looking for duplicates in %A..." fn
let before = clock.ElapsedMilliseconds
let set : uint64 [] = Array.zeroCreate (26*26*26*1000 / 64)
let fi = System.IO.FileInfo fn
let fsz = fi.Length
use mmf = System.IO.MemoryMappedFiles.MemoryMappedFile.CreateFromFile fn
use mmv = mmf.CreateViewAccessor (0L, fsz, System.IO.MemoryMappedFiles.MemoryMappedFileAccess.Read)
let start =
let ptr = NativeInterop.NativePtr.ofNativeInt 0n |> ref
mmv.SafeMemoryMappedViewHandle.AcquirePointer ptr
!ptr
let inline read ptr i = NativeInterop.NativePtr.get ptr i
let inline shift ptr m i = m * (int (read ptr i))
let inline char ptr i = read ptr i |> char
// deduct is used to reduce the number of subtractions
let deduct =
int 'A'*26*26*1000 + int 'A'*26*1000 + int 'A'*1000
+ int '0'*100 + int '0'*10 + int '0'*1
let rec unsafeloop ptr rem d=
if rem > 0L then
let i =
(shift ptr (26*26*1000) 0)
+ (shift ptr (26*1000) 1)
+ (shift ptr (1000) 2)
+ (shift ptr (100) 3)
+ (shift ptr (10) 4)
+ (shift ptr (1) 5)
- deduct
let p = i >>> 6
let b = i &&& 0x3F
let bt = 1UL <<< b
let sv = set.[p]
if (sv &&& bt) <> 0UL then
printfn " Duplicate detected: %c%c%c%c%c%c" (char ptr 0) (char ptr 1) (char ptr 2) (char ptr 3) (char ptr 4) (char ptr 5)
unsafeloop (NativeInterop.NativePtr.add ptr 8) (rem - 1L) (d + 1)
else
set.[p] <- sv ||| bt
unsafeloop (NativeInterop.NativePtr.add ptr 8) (rem - 1L) d
else
d
let dupes = unsafeloop start (fsz / 8L) 0
let now = clock.ElapsedMilliseconds
printfn " %A duplicates found, it took %d ms" dupes (now - before)
The results:
Looking for duplicates in "C:\temp\GoForPerf\Rgn00.txt"...
Duplicate detected: UKS855
1 duplicates found, it took 16 ms
Looking for duplicates in "C:\temp\GoForPerf\Rgn01.txt"...
Duplicate detected: GEE848
1 duplicates found, it took 52 ms
Looking for duplicates in "C:\temp\GoForPerf\Rgn02.txt"...
0 duplicates found, it took 56 ms