Skip to content

Instantly share code, notes, and snippets.

@czgdp1807
Last active March 16, 2023 13:21
Show Gist options
  • Save czgdp1807/e7f7b6ca52c57b16b27ec8d0259c6d4a to your computer and use it in GitHub Desktop.
Save czgdp1807/e7f7b6ca52c57b16b27ec8d0259c6d4a to your computer and use it in GitHub Desktop.

Codes

from ltypes import i32, f64

def test_dict():
    rollnumber2cpi: dict[i32, f64] = {0: 1.1}
    i: i32
    size: i32 = 100000000
    total: f64 = 0

    for i in range(1000, 1000 + size):
        rollnumber2cpi[i] = float(i/100.0 + 5.0)

    for i in range(1000, 1000 + size):
        total += rollnumber2cpi[i]

    print(total, len(rollnumber2cpi))

test_dict()
#include <unordered_map>
#include <iostream>

void test_dict() {
    std::unordered_map<int32_t, double> rollnumber2cpi;
    int32_t i, size = 100000000;
    double total = 0;

    for(i = 1000; i < 1000 + size; i++) {
        rollnumber2cpi[i] = double(i/100.0 + 5.0);
    }

    for(i = 1000; i < 1000 + size; i++) {
        total += rollnumber2cpi[i];
    }

    std::cout<<total;
}

int main() {
    test_dict();
}
#include <unordered_map>
#include <iostream>

size_t size = 100000000;

struct ModuloHash {
    size_t operator() (int32_t key) const
    {
        return key % size;
    }
};

void test_dict() {
    std::unordered_map<int32_t, double, ModuloHash> rollnumber2cpi;
    int32_t i;
    double total = 0;
    rollnumber2cpi.reserve(size);

    for(i = 1000; i < 1000 + size; i++) {
        rollnumber2cpi[i] = double(i/100.0 + 5.0);
    }

    for(i = 1000; i < 1000 + size; i++) {
        total += rollnumber2cpi[i];
    }

    std::cout<<total<<" "<<rollnumber2cpi.size();
}

int main() {
    test_dict();
}

codon code

def test_dict():
    rollnumber2cpi = {0: 1.1}
    size = 100000000
    total = 0.0

    for i in range(1000, 1000 + size):
        rollnumber2cpi[i] = float(i/100.0 + 5.0)

    for i in range(1000, 1000 + size):
        total += rollnumber2cpi[i]

    print(total, len(rollnumber2cpi))

test_dict()

Apple M1 Macbook Pro macOS Monterey 12.5

Compiler Time [s] Relative
LPython (dict02) 2.502 2.00
LPython (dict03) 1.87 1.5
LPython (dict03) OptimizedLinearProbing 1.85 1.48
LPython SeparateChaining 4.931 4.0
LPython (dict_neg_keys) --fast 0.984 0.8
LPython (dict_neg_keys) 1.922 1.5
clang++ 13.1.6 arm64-apple-darwin21.6.0 -std=c++11 30.814 24.73
clang++ 13.1.6 arm64-apple-darwin21.6.0 38.581 30.96
clang++ 13.1.6 arm64-apple-darwin21.6.0 ModuloHash 36.846 29.457
LPython (dict02) --fast 1.246 1.0
LPython (dict03) --fast 1.1 0.88
LPython (dict03) --fast OptimizedLinearProbing 1.104 0.88
LPython --fast SeparateChaining 4.270 3.4
clang++ 13.1.6 arm64-apple-darwin21.6.0 -O3 -funroll-loops -std=c++11 7.386 5.93
clang++ 13.1.6 arm64-apple-darwin21.6.0 -O3 -funroll-loops 7.573 6.08
clang++ 13.1.6 arm64-apple-darwin21.6.0 -O3 -funroll-loops -ffast-math ModuloHash 7.814 6.27
Python 3.10.4 22.678 18.20
codon 0.15.5 (codon build -release -exe and time ./executable) 4.983 4.0

Both the computation time (user) and system calls (sys) are lower for LPython.

(lp) 12:52:38:~/lpython_project/lpython % clang++ /Users/czgdp1807/lpython_project/dict_bench_hash.cpp
(lp) 12:52:44:~/lpython_project/lpython % time ./a.out
5.00015e+13 100000000./a.out  34.17s user 1.90s system 96% cpu 37.385 total
(lp) 12:53:24:~/lpython_project/lpython % clang++ -O3 -funroll-loops -ffast-math /Users/czgdp1807/lpython_project/dict_bench_hash.cpp
(lp) 12:54:03:~/lpython_project/lpython % time ./a.out
5.00015e+13 100000000./a.out  5.47s user 1.73s system 87% cpu 8.260 total
(lp) 12:54:14:~/lpython_project/lpython % lpython /Users/czgdp1807/lpython_project/dict_bench.py
(lp) 12:54:25:~/lpython_project/lpython % time ./a.out
50001499500000.00000000000000000 100000001
./a.out  1.35s user 0.33s system 99% cpu 1.691 total
(lp) 12:54:30:~/lpython_project/lpython % lpython --fast /Users/czgdp1807/lpython_project/dict_bench.py
(lp) 12:54:49:~/lpython_project/lpython % time ./a.out
50001499500000.00000000000000000 100000001
./a.out  0.49s user 0.33s system 94% cpu 0.866 total
(lp) 12:54:52:~/lpython_project/lpython %

Ubuntu 18.04.6 on Intel(R) Core(TM) i7-8550U CPU @ 1.80GHz

Compiler Time [s] Relative
LPython (dict02) 2.801 1.23
LPython SeparateChaining 4.738 2.1
clang++ version 6.0.0-1ubuntu2 -std=c++11 34.041 15.03
clang++ version 6.0.0-1ubuntu2 34.189 15.09
clang++ version 6.0.0-1ubuntu2 ModuloHash 28.280 12.485
g++ 7.5.0 39.505 17.44
LPython (dict02) --fast 2.265 1.0
LPython --fast SeparateChaining 4.162 1.8
clang++ version 6.0.0-1ubuntu2 -O3 -march=native -funroll-loops -std=c++11 6.482 2.86
clang++ version 6.0.0-1ubuntu2 -O3 -march=native -funroll-loops 6.47 2.85
clang++ version 6.0.0-1ubuntu2 -O3 -march=native -funroll-loops -ffast-math ModuloHash 6.402 2.82
g++ 7.5.0 -O3 -march=native -funroll-loops 8.427 3.72
Python 3.10.5 15.454 (Killed In Between by the OS) 6.82

Benchmarking with large number of collisions while inserting keys into the hash map.

Codes

from ltypes import i32, f64

def test_dict():
    rollnumber2cpi: dict[i32, f64] = {0: 1.1}
    i: i32
    size: i32 = 700000000
    total: f64 = 0

    for i in range(1000, 1000 + size, 7):
        rollnumber2cpi[i] = float(i/100.0 + 5.0)

    for i in range(1000, 1000 + size, 7):
        total += rollnumber2cpi[i]

    print(total - 350001496500000.06250000000000000, len(rollnumber2cpi))

test_dict()
#include <unordered_map>
#include <iostream>


void test_dict() {
    std::unordered_map<int32_t, double> rollnumber2cpi;
    int32_t i, size = 700000000;
    rollnumber2cpi.reserve(size/7);
    double total = 0;

    for(i = 1000; i < 1000 + size; i += 7) {
        rollnumber2cpi[i] = double(i/100.0 + 5.0);
    }

    for(i = 1000; i < 1000 + size; i += 7) {
        total += rollnumber2cpi[i];
    }

    std::cout<<(total - 350001496500000.06250000000000000)<<" "<<rollnumber2cpi.size();
}

int main() {
    test_dict();
}
#include <unordered_map>
#include <iostream>

size_t size = 700000000;

struct ModuloHash {
    size_t operator() (int32_t key) const
    {
        return key % (size / 7);
    }
};

void test_dict() {
    std::unordered_map<int32_t, double, ModuloHash> rollnumber2cpi;
    int32_t i;
    double total = 0;
    rollnumber2cpi.reserve(size/7);

    for(i = 1000; i < 1000 + size; i += 7) {
        rollnumber2cpi[i] = double(i/100.0 + 5.0);
    }

    for(i = 1000; i < 1000 + size; i += 7) {
        total += rollnumber2cpi[i];
    }

    std::cout<<(total - 350001496500000.06250000000000000)<<" "<<rollnumber2cpi.size();
}

int main() {
    test_dict();
}

codon code

def test_dict():
    rollnumber2cpi = {0: 1.1}
    size = 700000000
    total = 0.0

    for i in range(1000, 1000 + size, 7):
        rollnumber2cpi[i] = float(i/100.0 + 5.0)

    for i in range(1000, 1000 + size, 7):
        total += rollnumber2cpi[i]

    print(total - 350001496500000.06250000000000000, len(rollnumber2cpi))

test_dict()
Compiler Time [s] Relative
LPython (main) --fast 2.024 1.0
LPython (main) 3.043 1.5
LPython (dict_neg_keys) --fast 2.612 1.3
LPython (dict_neg_keys) 3.767 1.86
LPython --fast SeparateChaining 4.300 2.1
LPython SeparateChaining 4.679 2.3
codon 0.15.5 (codon build -release -exe and time ./executable) 4.822 2.38
clang++ 13.1.6 arm64-apple-darwin21.6.0 -O3 -funroll-loops -ffast-math ModuloHash 6.832 3.4
clang++ 13.1.6 arm64-apple-darwin21.6.0 -O3 -funroll-loops -ffast-math 6.908 3.4
Python 3.10.4 23.651 11.7
clang++ 13.1.6 arm64-apple-darwin21.6.0 37.670 18.6
clang++ 13.1.6 arm64-apple-darwin21.6.0 ModuloHash 36.132 17.8

Codes

from ltypes import i32, f64

def test_dict():
    rollnumber2cpi: dict[i32, f64] = {}
    i: i32
    size: i32 = 700000000
    total: f64 = 0

    for i in range(1000, 1000 + size, 7):
        rollnumber2cpi[i] = float(i/100.0 + 5.0)

    for i in range(1000, 1000 + size, 7):
        total += rollnumber2cpi.pop(i)

    print(total - 350001496500000.06250000000000000, len(rollnumber2cpi))

test_dict()
#include <unordered_map>
#include <iostream>

size_t size = 700000000;

struct ModuloHash {
    size_t operator() (int32_t key) const
    {
        return key % size;
    }
};

void test_dict() {
    std::unordered_map<int32_t, double, ModuloHash> rollnumber2cpi;
    int32_t i;
    double total = 0;

    for(i = 1000; i < 1000 + size; i += 7) {
        rollnumber2cpi[i] = double(i/100.0 + 5.0);
    }

    for(i = 1000; i < 1000 + size; i += 7) {
        total += rollnumber2cpi[i];
        rollnumber2cpi.erase(i);
    }

    std::cout<<(total - 350001496500000.06250000000000000)<<" "<<rollnumber2cpi.size()<<"\n";
}

int main() {
    test_dict();
}

codon code

def test_dict():
    rollnumber2cpi = {}
    size = 700000000
    total = 0.0

    for i in range(1000, 1000 + size, 7):
        rollnumber2cpi[i] = float(i/100.0 + 5.0)

    for i in range(1000, 1000 + size, 7):
        total += rollnumber2cpi.pop(i)

    print(total - 350001496500000.06250000000000000, len(rollnumber2cpi))

test_dict()
Compiler Time [s] Relative
LPython (dict05) --fast OptimizedLinearProbing 2.315 1.0
LPython (dict05) OptimizedLinearProbing 3.383 1.5
LPython --fast SeparateChaining 3.903 1.7
LPython (dict05) --fast 3.928 1.7
LPython SeparateChaining 4.212 1.8
codon 0.15.5 (codon build -release -exe and time ./executable) 4.456 2.0
LPython (dict05) 4.735 2.0
clang++ 13.1.6 arm64-apple-darwin21.6.0 -O3 -funroll-loops -ffast-math 10.731 4.6
Python 3.10.4 21.690 9.4
clang++ 13.1.6 arm64-apple-darwin21.6.0 56.361 24.3

Same as https://gist.github.com/czgdp1807/e7f7b6ca52c57b16b27ec8d0259c6d4a#file-dict_bench_str-md but with 64-bit integers as keys.

The difference can be seen clearly. With strings as keys, LPython cannot avail the benefits of,

  1. Spatial Locality of array elements (major factor).
  2. And each stringification calls C-runtime (minor factor accounts for 10% of the total time on my Apple M1 MacBook Pro).

Codes

from ltypes import i32, i64

def test_dict():
    number2cpi: dict[i64, i64] = {}
    i: i32
    size: i32 = 10000000
    c: i64 = 1048576
    correct: i64

    for i in range(1000, 1000 + size, 1):
        number2cpi[c + i] = int(i)

    correct = 0
    for i in range(1000, 1000 + size, 1):
        correct = (correct + number2cpi.pop(c + i))

    # 50009995000000 doesn't fit into i32
    # as a constant, so splitting it into two
    # parts and saving each into 64 bit so
    # that the product goes into 64 bit
    # where it fits easily
    answer1: i64 = 500099950
    answer2: i64 = 100000
    print(correct - answer1 * answer2, correct)
    assert correct == answer1 * answer2

test_dict()
#include <iostream>
#include <unordered_map>

void test_dict() {
    std::unordered_map<int64_t, int64_t> number2cpi;
    int32_t i, size = 10000000;
    int64_t c = 1048576;
    int64_t correct;

    for (i = 1000; i < 1000 + size; i++) {
        number2cpi[c + i] = i;
    }

    correct = 0;
    for (i = 1000; i < 1000 + size; i++) {
        correct = (correct + number2cpi[c + i]);
        number2cpi.erase(c + i);
    }

    /* 50009995000000 doesn't fit into i32
       as a constant, so splitting it into two
       parts and saving each into 64 bit so
       that the product goes into 64 bit
       where it fits easily */
    int64_t answer1 = 500099950;
    int64_t answer2 = 100000;
    std::cout<<(correct - answer1 * answer2)<<" "<<correct<<std::endl;
}

int main() {
    test_dict();
    return 0;
}
#include <unordered_map>
#include <iostream>
#include <string>

uint32_t size = 10000000;

struct ModuloHash {
    size_t operator() (int32_t key) const
    {
        return key % size;
    }
};

#include <iostream>
#include <unordered_map>
#include <string>

void test_dict() {
    std::unordered_map<int64_t, int64_t, ModuloHash> number2cpi;
    int32_t i;
    int64_t c = 1048576;
    int64_t correct;

    for (i = 1000; i < 1000 + size; i++) {
        number2cpi[c + i] = i;
    }

    correct = 0;
    for (i = 1000; i < 1000 + size; i++) {
        correct = (correct + number2cpi[c + i]);
        number2cpi.erase(c + i);
    }

    /* 50009995000000 doesn't fit into i32
       as a constant, so splitting it into two
       parts and saving each into 64 bit so
       that the product goes into 64 bit
       where it fits easily */
    int64_t answer1 = 500099950;
    int64_t answer2 = 100000;
    std::cout<<(correct - answer1 * answer2)<<" "<<correct<<std::endl;
}

int main() {
    test_dict();
    return 0;
}

codon code

def test_dict():
    number2cpi = {}
    size = 10000000
    c = 1048576

    for i in range(1000, 1000 + size, 1):
        number2cpi[c + i] = int(i)

    correct = 0
    for i in range(1000, 1000 + size, 1):
        correct = (correct + number2cpi.pop(c + i))

    # 50009995000000 doesn't fit into i32
    # as a constant, so splitting it into two
    # parts and saving each into 64 bit so
    # that the product goes into 64 bit
    # where it fits easily
    answer1 = 500099950
    answer2 = 100000
    print(correct - answer1 * answer2, correct)
    assert correct == answer1 * answer2

test_dict()

Apple M1 Macbook Pro macOS Monterey 12.5

Compiler Time [s] Relative
LPython (dict06) --fast 0.099 1.0
LPython (dict06) 0.134 1.4
LPython --fast SeparateChaining 0.216 2.2
codon 0.15.5 (codon build -release -exe and time ./executable) 0.257 2.6
LPython SeparateChaining 0.342 3.4
clang++ 13.1.6 arm64-apple-darwin21.6.0 -O3 -funroll-loops -ffast-math 0.979 9.9
clang++ 13.1.6 arm64-apple-darwin21.6.0 -O3 -funroll-loops -ffast-math ModuloHash 1.010 10.2
Python 3.10.4 1.676 16.9
clang++ 13.1.6 arm64-apple-darwin21.6.0 5.512 55.7
clang++ 13.1.6 arm64-apple-darwin21.6.0 ModuloHash 5.646 57.0

Ubuntu 18.04.6 on Intel(R) Core(TM) i7-8550U CPU @ 1.80GHz

Compiler Time [s] Relative
LPython (dict06) --fast 0.372
LPython (dict06) 0.441 1.2
LPython --fast SeparateChaining 0.533 1.4
LPython SeparateChaining 0.629 1.7
clang++ version 6.0.0-1ubuntu2 -O3 -march=native -funroll-loops -ffast-math 0.779 2.1
clang++ version 6.0.0-1ubuntu2 -O3 -march=native -funroll-loops -ffast-math ModuloHash 0.911 2.4
g++ 7.5.0 -O3 -march=native -funroll-loops -ffast-math 1.015 2.7
g++ 7.5.0 -O3 -march=native -funroll-loops -ffast-math ModuloHash 1.185 3.2
Python 3.10.5 2.534 6.8
clang++ version 6.0.0-1ubuntu2 ModuloHash 3.994 10.7
clang++ version 6.0.0-1ubuntu2 4.519 12.1
g++ 7.5.0 ModuloHash 4.567 12.3
g++ 7.5.0 5.270 14.2

Codes

from ltypes import i32, i64

def test_dict():
    number2cpi: dict[str, i64] = {}
    i: i32
    size: i32 = 10000000
    key: str
    c: i32 = 1048576
    correct: i64

    for i in range(1000, 1000 + size, 1):
        key = str(c + i)
        number2cpi[key] = int(i)

    correct = 0
    for i in range(1000, 1000 + size, 1):
        key = str(c + i)
        correct = (correct + number2cpi.pop(key))

    # 50009995000000 doesn't fit into i32
    # as a constant, so splitting it into two
    # parts and saving each into 64 bit so
    # that the product goes into 64 bit
    # where it fits easily
    answer1: i64 = 500099950
    answer2: i64 = 100000
    print(correct - answer1 * answer2, correct)
    assert correct == answer1 * answer2

test_dict()
#include <iostream>
#include <unordered_map>
#include <string>

void test_dict() {
    std::unordered_map<std::string, int64_t> number2cpi;
    int32_t i, size = 10000000;
    std::string key;
    int32_t c = 1048576;
    int64_t correct;

    for (i = 1000; i < 1000 + size; i++) {
        number2cpi[std::to_string(c + i)] = i;
    }

    correct = 0;
    for (i = 1000; i < 1000 + size; i++) {
        key = std::to_string(c + i);
        correct = (correct + number2cpi[key]);
        number2cpi.erase(key);
    }

    /* 50009995000000 doesn't fit into i32
       as a constant, so splitting it into two
       parts and saving each into 64 bit so
       that the product goes into 64 bit
       where it fits easily */
    int64_t answer1 = 500099950;
    int64_t answer2 = 100000;
    std::cout<<(correct - answer1 * answer2)<<" "<<correct<<std::endl;
}

int main() {
    test_dict();
    return 0;
}
#include <unordered_map>
#include <iostream>
#include <string>

uint32_t size = 10000000;

struct PolynomialRollingHash {
    size_t operator() (const std::string& key) const
    {
        const uint64_t p = 31;
        const uint64_t m = 100000009;
        uint64_t hash_value = 0;
        uint64_t p_pow = 1;
        for (char c : key) {
            hash_value = (hash_value + (c - 'a' + 1) * p_pow) % m;
            p_pow = (p_pow * p) % m;
        }
        return hash_value % size;
    }
};

#include <iostream>
#include <unordered_map>
#include <string>

void test_dict() {
    std::unordered_map<std::string, int64_t, PolynomialRollingHash> number2cpi;
    int32_t i;
    std::string key;
    int32_t c = 1048576;
    int64_t correct;

    for (i = 1000; i < 1000 + size; i++) {
        number2cpi[std::to_string(c + i)] = i;
    }

    correct = 0;
    for (i = 1000; i < 1000 + size; i++) {
        key = std::to_string(c + i);
        correct = (correct + number2cpi[key]);
        number2cpi.erase(key);
    }

    /* 50009995000000 doesn't fit into i32
       as a constant, so splitting it into two
       parts and saving each into 64 bit so
       that the product goes into 64 bit
       where it fits easily */
    int64_t answer1 = 500099950;
    int64_t answer2 = 100000;
    std::cout<<(correct - answer1 * answer2)<<" "<<correct<<std::endl;
}

int main() {
    test_dict();
    return 0;
}

codon code

def test_dict():
    number2cpi = {}
    size = 10000000
    c = 1048576

    for i in range(1000, 1000 + size, 1):
        key = str(c + i)
        number2cpi[key] = int(i)

    correct = 0
    for i in range(1000, 1000 + size, 1):
        key = str(c + i)
        correct = (correct + number2cpi.pop(key))

    # 50009995000000 doesn't fit into i32
    # as a constant, so splitting it into two
    # parts and saving each into 64 bit so
    # that the product goes into 64 bit
    # where it fits easily
    answer1 = 500099950
    answer2 = 100000
    print(correct - answer1 * answer2, correct)
    assert correct == answer1 * answer2

test_dict()

Apple M1 Macbook Pro macOS Monterey 12.5

Compiler Time [s] Relative
codon 0.15.5 (codon build -release -exe and time ./executable) 2.675 0.11
LPython --fast SeparateChaining 4.831 0.4
LPython SeparateChaining 5.048 0.5
clang++ 13.1.6 arm64-apple-darwin21.6.0 -O3 -funroll-loops -ffast-math PolynomialRollingHash 5.522 0.5
Python 3.10.4 6.050 0.6
LPython (dict08) --fast tripling 6.420 0.6
LPython (dict08) tripling 7.596 0.7
clang++ 13.1.6 arm64-apple-darwin21.6.0 -O3 -funroll-loops -ffast-math 7.608 0.7
LPython (dict06) --fast 10.784 1.0
LPython (dict06) 11.831 1.0
clang++ 13.1.6 arm64-apple-darwin21.6.0 PolynomialRollingHash 14.363 1.3
clang++ 13.1.6 arm64-apple-darwin21.6.0 16.477 1.5

codon output

18:46:31:~/lpython_project % time ./dict_bench
0 50009995000000
./dict_bench  3.19s user 0.09s system 122% cpu 2.675 total

Ubuntu 18.04.6 on Intel(R) Core(TM) i7-8550U CPU @ 1.80GHz

Compiler Time [s] Relative
LPython --fast SeparateChaining 8.323 0.3
LPython SeparateChaining 8.436 0.3
Python 3.10.5 9.426 0.4
clang++ version 6.0.0-1ubuntu2 -O3 -march=native -funroll-loops -ffast-math PolynomialRollingHash 11.293 0.5
g++ 7.5.0 -O3 -march=native -funroll-loops -ffast-math PolynomialRollingHash 11.517 0.5
g++ 7.5.0 -O3 -march=native -funroll-loops -ffast-math 12.147 0.5
clang++ version 6.0.0-1ubuntu2 -O3 -march=native -funroll-loops -ffast-math 12.215 0.5
LPython (dict08) --fast tripling 12.415 0.5
LPython (dict08) tripling 12.963 0.5
clang++ version 6.0.0-1ubuntu2 18.36 0.75
g++ 7.5.0 18.985 0.78
g++ 7.5.0 PolynomialRollingHash 20.422 0.8
clang++ version 6.0.0-1ubuntu2 PolynomialRollingHash 21.463 0.88
LPython (dict06) --fast 24.493 1.0
LPython (dict06) 27.328 1.1
@certik
Copy link

certik commented Aug 18, 2022

On my M1 Max I get:

$ lpython a.py --fast
$ time ./a.out
50001499500000.00000000000000000 100000001
./a.out  0.42s user 0.26s system 96% cpu 0.703 total

and

$ lpython a.py 
$ time ./a.out
50001499500000.00000000000000000 100000001
./a.out  1.29s user 0.25s system 98% cpu 1.575 total

and

$ clang++ -O3 -funroll-loops -std=c++20 -ffast-math b.cpp
$ time ./a.out
5.00015e+13./a.out  3.60s user 0.58s system 99% cpu 4.192 total

I also tried std::map, then I got 14.2s.

The following code:

#include <unordered_map>
#include <iostream>

void test_dict() {
    std::unordered_map<int32_t, double> rollnumber2cpi;
    int32_t i, size = 100000000 + 1000;
    double total = 0;
    rollnumber2cpi.reserve(size);

    for(i = 1000; i < 1000 + size; i++) {
        rollnumber2cpi[i] = double(i/100.0 + 5.0);
    }

    for(i = 1000; i < 1000 + size; i++) {
        total += rollnumber2cpi[i];
    }

    std::cout<<total;
}

int main() {
    test_dict();
}

Runs at 3.915s.

The ModuloHash above gives me:

$ clang++ -O3 -funroll-loops -std=c++20 -ffast-math d.cpp 
$ time ./a.out 
5.00015e+13 100000000./a.out  3.38s user 0.47s system 98% cpu 3.890 total

@certik
Copy link

certik commented Sep 9, 2022

One has to do number2cpi.reserve(size); in the C++ code. Then I am getting:

$ time ./a.out
0 50009995000000
./a.out  3.70s user 0.12s system 99% cpu 3.839 total

instead of:

$ time ./a.out 
0 50009995000000
./a.out  5.74s user 0.15s system 99% cpu 5.914 total

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment