ZhouXing19/n2-standard-8.sh

## n2-standard-8.sh
#!/bin/bash

NAME_EXTRA=${NAME_EXTRA:=ori}

CLOUD="gce"
CLUSTER="$CRL_USERNAME-cldrprt23-n2-standard-8-4140608692-$NAME_EXTRA"
TMUX_SESSION="cloud-report"
WEST_CLUSTER="${CLUSTER}-west"
west_cluster_created=''

# If env var NODES is not specified, set NODES to 4.
NODES=${NODES:=4}

TPCC_WAREHOURSE_PER_VCPU=${TPCC_WAREHOURSE_PER_VCPU:=125}

# We start different ports for testserver for the cross-region and intra-az network test.
CROSS_REGION_PORT=12865
INTER_AZ_PORT=1337

set -ex
scriptName=$(basename ${0%.*})
logdir="$(dirname $0)/../logs/${scriptName}"
mkdir -p "$logdir"

# Redirect stdout and stderr into script log file
exec &> >(tee -a "$logdir/driver-$NAME_EXTRA.log")

# Create roachprod cluster
function create_cluster() {
  roachprod create "$CLUSTER" -n $NODES --lifetime "20h" --clouds "$CLOUD" \
    --$CLOUD-machine-type "n2-standard-8" --gce-zones="us-east4-c"  --gce-pd-volume-size="1000"  --gce-pd-volume-type="pd-ssd" --gce-min-cpu-platform="Intel Cascade Lake" --local-ssd="false" --gce-image="ubuntu-2004-focal-v20210927" \
    --label usage=cloud-report-2022

  roachprod run "$CLUSTER" -- tmux new -s "$TMUX_SESSION" -d
}

# Create roachprod in us-west2
function create_west_cluster() {
  roachprod create "$WEST_CLUSTER" -u $USER -n 1 --lifetime "20h" --clouds "$CLOUD" \
    --$CLOUD-machine-type "n2-standard-8" --gce-zones="us-west1-a"  --gce-pd-volume-size="1000"  --gce-pd-volume-type="pd-ssd" --gce-min-cpu-platform="Intel Cascade Lake" --local-ssd="false" --gce-image="ubuntu-2004-focal-v20210927" \
    --label usage=cloud-report-2022

  roachprod run "$WEST_CLUSTER" -- tmux new -s "$TMUX_SESSION" -d
  west_cluster_created=true
}

# Upload scripts to roachprod cluster
function upload_scripts() {
  roachprod run "$1" rm  -- -rf ./scripts
  roachprod put "$1" ./scripts scripts
  echo "n2-standard-8" > "machinetype.txt"
  roachprod put "$1" "machinetype.txt" "machinetype.txt"
  roachprod run "$1" chmod -- -R +x ./scripts
  roachprod put "$1" ./netperf ./netperf
  roachprod run "$1" chmod -- -R +x ./netperf
}

# Load the cockroach binary to roachprod cluster
function load_cockroach() {
  roachprod run "$1" "rm -f ./cockroach"
  if [ -z "$cockroach_binary" ]
  then
    cockroach_version=$(curl -s -i https://edge-binaries.cockroachdb.com/cockroach/cockroach.linux-gnu-amd64.LATEST |grep location|awk -F"/" '{print $NF}')
    echo "WARN: staging a stable cockroach binary from master with hash: 5ac733bb4927020bc1c52da24b2591742fde8e1f"
    roachprod stage "$1" cockroach
  elif [[ $cockroach_binary =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
    echo "INFO: staging release version $cockroach_binary of cockroach binary"
    roachprod stage "$1" release "$cockroach_binary"
  else
    echo "WARN: staging unknown version of cockroach binary from local path: $cockroach_binary"
    roachprod put "$1" "$cockroach_binary" "cockroach"
  fi
}

# Start cockroach cluster on nodes [1-NODES-1].
function start_cockroach() {
  # Build --store flags based on the number of disks.
  # Roachprod adds /mnt/data1/cockroach by itself, so, we'll pick up the other disks
  for s in $(roachprod run "$CLUSTER":1 'ls -1d /mnt/data[2-9]* 2>/dev/null || echo')
  do
   stores="$stores --store $s/cockroach"
  done

  if [[ -z $stores ]]; then
    stores="--store=/mnt/data1/cockroach"
  fi

  if [[ $NODES == 2 ]]; then
  	roachprod start "$CLUSTER":1 --args="$stores --cache=0.25 --max-sql-memory=0.4" --num-files-limit=512000
  else
  	roachprod start "$CLUSTER":1-$((NODES-1)) --args="$stores --cache=0.25 --max-sql-memory=0.4" --num-files-limit=512000
  fi
}

# Execute setup.sh script on the cluster to configure it
function setup_cluster() {
	roachprod run "$1" sudo ./scripts/gen/setup.sh "$CLOUD"
	roachprod run "$1":1 -- cpufetch -s legacy|awk -F"@" '{print $NF}'|tr -d ' '|awk NF > "$logdir"/"$1"_cpu_info.txt
    roachprod run "$1":1 -- lscpu |grep "MHz" >> "$logdir"/"$1"_cpu_info.txt
}

# executes command on a host using roachprod, under tmux session.
function run_under_tmux() {
  local name=$1
  local host=$2
  local cmd=$3
  roachprod run $host -- tmux neww -t "$TMUX_SESSION" -n "$name" -d -- "$cmd"
}

# Benchmark scripts should execute a single benchmark
# and download results to the $logdir directory.
# results_dir returns date suffixed directory under logdir.
#
function results_dir() {
  echo "$logdir/$1.$(date +%Y%m%d.%T)-$NAME_EXTRA"
}

function copy_result_with_retry() {
  # There is a random roachprod issue that we recently identified:
  # After a test finished successfully in a host, the following "roachprod get"
  # command didn't copy the result files correctly from the host node to the
  # client, so the target result directory ended up with all empty files.
  # This function will copy result files from host to client and check that
  # there is no empty result file in the target directory. It will retry the
  # copy step if the previous copy failed with any empty result files, the test
  # will fail if after retry it still couldn't copy file correctly.
  #
  target_dir=$(results_dir "$2")
  for i in {1..3}
  do
    roachprod get "$1" "./$2" "$target_dir"

    result_files=$(find "$target_dir" -empty -type f -name "*.log")
    if [ -z "$result_files" ]
    then
      echo "Test passed!"
      break
    fi
    echo "Copy file round "$i" failed, found empty result file(s):\n$result"
    sleep 5s
  done

  if [ ! -z "$result_files" ]
  then
    echo "Copy failed with empty result file(s) in "$target_dir", test failed!"
  fi

  if [ "$2" == "tpcc-results" ]
  then
    result_files=$(find "$target_dir" -type f -name "*.txt")
    for result_file in $result_files
    do
      prev_line=$(tail -2 "$result_file")
      if [[ "$prev_line" != *efc* ]] || [[ $(tail -1 "$result_file" | awk '{if(int($3) > 87){print "pass"}}') != "pass" ]];
      then
      	# Instead of deleting invalid result files, we rename them for auditing
      	# and validation purpose.
        mv $result_file "$result_file.bak"
      fi
    done
  fi
}

# Run CPU benchmark
function bench_cpu() {
  run_under_tmux "cpu" "$CLUSTER:1"  "./scripts/gen/cpu.sh $cpu_extra_args"
}

# Wait for CPU benchmark to finish and retrieve results.
function fetch_bench_cpu_results() {
  node="$CLUSTER":1
  roachprod run $node ./scripts/gen/cpu.sh -- -w
  copy_result_with_retry $node "coremark-results"
}

# Run FIO benchmark
function bench_io() {
  run_under_tmux "io" "$CLUSTER:1" "./scripts/gen/fio.sh $io_extra_args"
}

# Wait for FIO benchmark top finish and retrieve results.
function fetch_bench_io_results() {
  node="$CLUSTER":1
  roachprod run $node ./scripts/gen/fio.sh -- -w
  copy_result_with_retry $node "fio-results"
}

# Run IO Fsync benchmark
function bench_iofsync() {
  run_under_tmux "iofsync" "$CLUSTER:1" "./scripts/gen/fio_fsync.sh $iofsync_extra_args"
}

# Wait for IO FSync benchmark top finish and retrieve results.
function fetch_bench_iofsync_results() {
  roachprod run "$CLUSTER":1 ./scripts/gen/fio_fsync.sh -- -w
  roachprod get "$CLUSTER":1 ./fio-fsync-results $(results_dir "fio-fsync-results")
}

# Wait for Netperf benchmark to complete and fetch results.
function fetch_bench_net_results() {
  if [ $NODES -lt 2 ]
  then
    echo "NODES must be greater than 1 for this test"
    exit 1
  fi

  target_dir=$(results_dir "netperf-results")
  if [ $NODES -eq 2 ]
  then
    node="$CLUSTER":1
  else
    node="$CLUSTER":3
  fi
  roachprod run $node ./scripts/gen/network-netperf.sh -- -w
  copy_result_with_retry $node "netperf-results"
}

# Run TPCC Benchmark
function bench_tpcc() {
  if [ $NODES -lt 2 ]; then
    echo "NODES must be greater than 1 for this test"
    exit 1
  fi

  if [[ -z $TPCC_WAREHOURSE_PER_VCPU ]]; then
    echo "env var TPCC_WAREHOURSE_PER_VCPU must not be set empty"
    exit 1
  fi

  start_cockroach
  if [ $NODES -eq 2 ]; then
    pgurls=$(roachprod pgurl "$CLUSTER":1)
    run_under_tmux "tpcc" "$CLUSTER:2" "./scripts/gen/tpcc.sh -a $TPCC_WAREHOURSE_PER_VCPU $tpcc_extra_args ${pgurls[@]}"
  else
    pgurls=$(roachprod pgurl "$CLUSTER":1-$((NODES-1)))
    run_under_tmux "tpcc" "$CLUSTER:$NODES" "./scripts/gen/tpcc.sh -a $TPCC_WAREHOURSE_PER_VCPU $tpcc_extra_args ${pgurls[@]}"
  fi
}

function fetch_bench_tpcc_results() {
  if [ $NODES -lt 2 ]
  then
    echo "NODES must be greater than 1 for this test"
    exit 1
  fi

  node="$CLUSTER":$NODES

  # Don't exist if the following section gives error.
  set +e
  roachprod run $node ./scripts/gen/tpcc.sh -- -w
  copy_result_with_retry $node "tpcc-results"
  set -e
}

# modify_remote_hosts_on_client_node is to get the ip from the remote node,
# write it into a local file, and mount it to the netperf/doc/examples folder
# in the client node.
function modify_remote_hosts_on_client_node() {
  # client_node is the one to run TCP_RR and TCP_STREAM.
  local client_node=$1
  local server_node=$2
  # test_mode should be either cross-region or intra-az.
  local test_mode=$3

  local server_ip=$(roachprod ip "$server_node")
  if [ -z $server_ip ]
  then
    echo "cannot get server_ip FOR server (remote) node ($server_node) in network test"
    exit 1
  fi

  # Since linux doesn't allow ":" in filename, we replace the ":" in
  # $server_node to "-".
  formatted_server_node=$(echo "${server_node//:/-}")
  echo "formatted_server_node=$formatted_server_node"

  # Save the ip address of the server node into the
  # netperf/doc/examples/remote_hosts in the client node.
  local remote_host_file="${logdir}/${formatted_server_node}_${test_mode}_remote_hosts"
  printf "REMOTE_HOSTS[0]=$server_ip\nREMOTE_HOSTS[1]=$server_ip\nNUM_REMOTE_HOSTS=2\n" >"$remote_host_file"
  chmod 777 "$remote_host_file"
  roachprod run "$client_node" -- sudo chmod 777 -R netperf
  roachprod put "$client_node" "$remote_host_file" netperf/doc/examples/${test_mode}_remote_hosts
}

# get_best_number_streams is to run a netperf TCP_STREAM test with
# gradually incrementing the number of streams until the aggregate throughput
# converges (the latest 3 agg throughput's std < 0.3). The best number of
# streams will be saved in a file "num_streams" in the client node.
function get_best_number_streams() {
  local client_node=$1
  local test_mode=$2
  echo "running getting best num of stream for $client_node"

  roachprod run "$client_node" -- "cd netperf/doc/examples && SEARCH_BEST_NUM_STREAMS=1 TEST_MODE=$test_mode ./runemomniaggdemo.sh"
  echo "get best number of stream for $client_node"
}

# run_netperf_between_server_client is to get the best number of streams
# to run the throughput test between the server and client node, and run the
# netperf latency and throughput test. We start the netserver on the server node
# and run netperf command on the client node.
# Note that in the cross-region case, we set the east node as the client node,
# and the west node as the server node.
function run_netperf_between_server_client() {

  local client_node=$1
  local server_node=$2

  local PORT=$3
  local test_mode=$4
  local netperf_extra_args=$5

  local server_ip=$(roachprod ip $server_node)

  roachprod run $client_node sudo ./scripts/gen/network-setup.sh
  roachprod run $server_node sudo ./scripts/gen/network-setup.sh

  # Start netserver on the server node.
  # It may give error, but it only means that the netserver is already running
  # on the given port, so we should proceed when that happens.
  set +e
  roachprod run $server_node ./scripts/gen/network-test.sh -- -S -p $PORT -m $server_node
  set -e

  # Mount a file containing server's ip to the client node.
  modify_remote_hosts_on_client_node $client_node $server_node $test_mode
  get_best_number_streams $client_node $test_mode
  run_under_tmux "${test_mode}-net" $client_node "./scripts/gen/network-test.sh -s $server_ip -p $PORT -m $test_mode -z $CLOUD-n2-standard-8 $netperf_extra_args"
}

# Run intra-az Netperf benchmark. The test will be run the 1st and the 2nd
# nodes of the same cluster.
function bench_intra_az_net() {
  if [ $NODES -lt 2 ]
  then
    echo "NODES must be greater than 1 for this test"
    exit 1
  fi

  local server_node="$CLUSTER":2
  local client_node="$CLUSTER":1

  run_netperf_between_server_client $client_node $server_node $INTER_AZ_PORT intra-az "$net_extra_args"
}

# Wait for Netperf benchmark to complete and fetch results.
function fetch_bench_intra_az_net_results() {
  if [ $NODES -lt 2 ]
  then
    echo "NODES must be greater than 1 for this test"
    exit 1
  fi

  roachprod run ${CLUSTER}:1 ./scripts/gen/network-test.sh -- -w -m intra-az
  roachprod get ${CLUSTER}:1 ./intra-az-netperf-results $(results_dir "intra-az-netperf-results")
}

# bench_cross_region_net is run the cross-region network tests.
function bench_cross_region_net() {
  create_west_cluster
  upload_scripts "$WEST_CLUSTER"
  setup_cluster "$WEST_CLUSTER"

  run_netperf_between_server_client ${CLUSTER}:1 ${WEST_CLUSTER}:1 $CROSS_REGION_PORT cross-region $cross_region_net_extra_args
}

# fetch_bench_cross_region_net_results is to wait the cross-region network test
# to finish and the fetch the results from the server node.
function fetch_bench_cross_region_net_results() {
  roachprod run ${CLUSTER}:1 ./scripts/gen/network-test.sh -- -w -m cross-region
  roachprod get ${CLUSTER}:1 ./cross-region-netperf-results $(results_dir "cross-region-netperf-results")
}

# Destroy roachprod cluster
function destroy_cluster() {
  roachprod destroy "$CLUSTER"
  if [[ -n $west_cluster_created ]]; then
    roachprod destroy "$WEST_CLUSTER"
  fi
}

function usage() {
echo "$1
Usage: $0 [-b <bootstrap>]... [-w <workload>]... [-d] [-c cockroach_binary]
   -b: One or more bootstrap steps.
         -b create: creates cluster
         -b upload: uploads required scripts
         -b setup: execute setup script on the cluster
         -b all: all of the above steps
   -w: Specify workloads (benchmarks) to execute.
       -w cpu : Benchmark CPU
       -w io  : Benchmark IO
       -w iofsync : Benchmark IO Fsync
       -w ia_net : Benchmark Net. Please don't run "ia_net" and "cr_net" on the same cluster.
       -w cr_net : Benchmark Cross-region Net. Please don't run "ia_net" and "cr_net" on the same cluster.
       -w tpcc: Benchmark TPCC
       -w all : All of the above
   -c: Override cockroach binary to stage (local path to binary or release version)
   -r: Do not start benchmarks specified by -w.  Instead, resume waiting for their completion.
   -I: additional IO benchmark arguments
   -F: additional IO Fsync benchmark arguments
   -N: additional network benchmark arguments
   -C: additional CPU benchmark arguments
   -T: additional TPCC benchmark arguments
   -R: additional cross-region network benchmark arguments
   -n: override number of nodes in a cluster
   -d: Destroy cluster
"
exit 1
}

benchmarks=()
f_resume=''
do_create=''
do_upload=''
do_setup=''
do_destroy=''
io_extra_args=''
iofsync_extra_args=''
cpu_extra_args=''
tpcc_extra_args=' -L "--provider-override=gs --bucket-override=cloud-report-tpcc" '
intra_az_net_extra_args=''
cross_region_net_extra_args=''
cockroach_binary=''

while getopts 'c:b:w:dn:I:F:N:C:T:R:r' flag; do
  case "${flag}" in
    b) case "${OPTARG}" in
        all)
          do_create='true'
          do_upload='true'
          do_setup='true'
          do_cockroach='true'
        ;;
        create)    do_create='true' ;;
        upload)    do_upload='true' ;;
        setup)     do_setup='true' ;;
        *) usage "Invalid -b value '${OPTARG}'" ;;
       esac
    ;;
    c) cockroach_binary="${OPTARG}" ;;
    w) case "${OPTARG}" in
         cpu) benchmarks+=("bench_cpu") ;;
         io) benchmarks+=("bench_io") ;;
         iofsync) benchmarks+=("bench_iofsync") ;;
         ia_net) benchmarks+=("bench_intra_az_net") ;;
         cr_net) benchmarks+=("bench_cross_region_net") ;;
         tpcc) benchmarks+=("bench_tpcc") ;;
         all) benchmarks+=("bench_cpu" "bench_io" "bench_io_fsync" "bench_tpcc" "bench_cross_region_net") ;;
         *) usage "Invalid -w value '${OPTARG}'";;
       esac
    ;;
    d) do_destroy='true' ;;
    r) f_resume='true' ;;
    n) NODES="${OPTARG}" ;;
    I) io_extra_args="${OPTARG}" ;;
    F) iofsync_extra_args="${OPTARG}" ;;
    C) cpu_extra_args="${OPTARG}" ;;
    T) tpcc_extra_args="${OPTARG}" ;;
    N) intra_az_net_extra_args="${OPTARG}" ;;
    R) cross_region_net_extra_args="${OPTARG}" ;;
    *) usage ;;
  esac
done

if [ -n "$do_create" ];
then
  create_cluster
fi

if [ -n "$do_upload" ];
then
  upload_scripts $CLUSTER
  load_cockroach $CLUSTER
fi

if [ -n "$do_setup" ];
then
  setup_cluster $CLUSTER
fi

if [ -z "$f_resume" ]
then
  # Execute requested benchmarks.
  for bench in "${benchmarks[@]}"
  do
    $bench
  done
fi

# Wait for benchmarks to finsh and fetch their results.
for bench in "${benchmarks[@]}"
do
  echo "Waiting for $bench to complete"
  fetch="fetch_${bench}_results"
  $fetch
done

if [ -n "$do_destroy" ];
then
  destroy_cluster
fi

## tpcc.sh
#!/bin/bash

set -ex
pidfile="$HOME/tpcc-bench.pid"
f_force=''
f_wait=''
f_active=0
f_warehouses=10000
f_active_per_core=125
f_skip_load=''
f_load_args=''
f_duration="30m"

function usage() {
  echo "$1
Usage: $0 [-f] [-w] [-s server] [pgurl,...]
  -f: ignore existing pid file; override and rerun.
  -w: wait for currently running benchmark to complete.
  -W: number of warehouses; default 2500
  -A: number of starting active warehouses
  -s: skip loading stage
  -L: extra args for load
  -a: number of active warehouses per core
  -d: duration; default 30m
"
  exit 1
}

while getopts 'fwsW:A:d:a:L:' flag; do
  case "${flag}" in
    f) f_force='true' ;;
    w) f_wait='true' ;;
    W) f_warehouses="${OPTARG}" ;;
    A) f_active="${OPTARG}" ;;
    a) f_active_per_core="${OPTARG}" ;;
    s) f_skip_load='true' ;;
    d) f_duration="${OPTARG}" ;;
    L) f_load_args="${OPTARG}" ;;
    *) usage "";;
  esac
done

logdir="$HOME/tpcc-results"

if [ -n "$f_wait" ];
then
  exec sh -c "
    ( test -f '$logdir/success' ||
      (test -f \"$pidfile\" && tail --pid \"$(cat $pidfile)\" -f /dev/null && test -f '$logdir/success')
    ) || (echo 'TPC-C benchmark did not complete successfully.  Check logs'; exit 1)"
fi

echo "f_load_args:[$f_load_args]"
echo "f_active_per_core:[$f_active_per_core]"


if [ -f "$pidfile" ] && [ -z "$f_force" ];
then
  pid=$(cat "$pidfile")
  echo "TPCC benchmark already running (pid $pid)"
  exit
fi

shift $((OPTIND - 1 ))
pgurls=("$@")

if [[ ${#pgurls[@]} == 0 ]];
then
  usage "list of pgurls required"
fi

trap "rm -f $pidfile" EXIT SIGINT
echo $$ > "$pidfile"

rm -rf "$logdir"
mkdir "$logdir"
exec &> >(tee -a "$logdir/script.log")

cd "$HOME"

if [ -z "$f_skip_load" ]
then
  #./cockroach sql --insecure --url "${pgurls[0]}" -e "
  # SET CLUSTER SETTING kv.snapshot_recovery.max_rate = '512 MiB';
  # SET CLUSTER SETTING kv.snapshot_rebalance.max_rate = '512 MiB';
	# SET CLUSTER SETTING admission.kv.enabled,=false;
	# SET CLUSTER SETTING admission.sql_kv_response.enabled=false;
	# SET CLUSTER SETTING admission.sql_sql_response.enabled=false;
  #";
  echo "Loading TPCC fixture for $f_warehouses warehouses ..."
  ./cockroach workload fixtures load tpcc --checks=false --warehouses="$f_warehouses" $f_load_args "${pgurls[0]}"
  echo "done loading"
fi

if (( f_active == 0 ))
then
  # Scale active warehouse count by f_active_per_core * number of CPUs.
  f_active=$(( f_active_per_core * $(cat /proc/cpuinfo | grep processor | wc -l) ))
  if (( f_active > f_warehouses ))
  then
    f_active=0
  fi
fi

report="${logdir}/tpcc-results-$f_active.txt"
./cockroach workload run tpcc \
  --warehouses="$f_warehouses" --active-warehouses="$f_active" --ramp=5m --duration="$f_duration" --tolerate-errors --wait=0 \
  "${pgurls[@]}" > "$report"
	#!/bin/bash

	NAME_EXTRA=${NAME_EXTRA:=ori}

	CLOUD="gce"
	CLUSTER="$CRL_USERNAME-cldrprt23-n2-standard-8-4140608692-$NAME_EXTRA"
	TMUX_SESSION="cloud-report"
	WEST_CLUSTER="${CLUSTER}-west"
	west_cluster_created=''

	# If env var NODES is not specified, set NODES to 4.
	NODES=${NODES:=4}

	TPCC_WAREHOURSE_PER_VCPU=${TPCC_WAREHOURSE_PER_VCPU:=125}

	# We start different ports for testserver for the cross-region and intra-az network test.
	CROSS_REGION_PORT=12865
	INTER_AZ_PORT=1337

	set -ex
	scriptName=$(basename ${0%.*})
	logdir="$(dirname $0)/../logs/${scriptName}"
	mkdir -p "$logdir"

	# Redirect stdout and stderr into script log file
	exec &> >(tee -a "$logdir/driver-$NAME_EXTRA.log")

	# Create roachprod cluster
	function create_cluster() {
	roachprod create "$CLUSTER" -n $NODES --lifetime "20h" --clouds "$CLOUD" \
	--$CLOUD-machine-type "n2-standard-8" --gce-zones="us-east4-c" --gce-pd-volume-size="1000" --gce-pd-volume-type="pd-ssd" --gce-min-cpu-platform="Intel Cascade Lake" --local-ssd="false" --gce-image="ubuntu-2004-focal-v20210927" \
	--label usage=cloud-report-2022

	roachprod run "$CLUSTER" -- tmux new -s "$TMUX_SESSION" -d
	}

	# Create roachprod in us-west2
	function create_west_cluster() {
	roachprod create "$WEST_CLUSTER" -u $USER -n 1 --lifetime "20h" --clouds "$CLOUD" \
	--$CLOUD-machine-type "n2-standard-8" --gce-zones="us-west1-a" --gce-pd-volume-size="1000" --gce-pd-volume-type="pd-ssd" --gce-min-cpu-platform="Intel Cascade Lake" --local-ssd="false" --gce-image="ubuntu-2004-focal-v20210927" \
	--label usage=cloud-report-2022

	roachprod run "$WEST_CLUSTER" -- tmux new -s "$TMUX_SESSION" -d
	west_cluster_created=true
	}

	# Upload scripts to roachprod cluster
	function upload_scripts() {
	roachprod run "$1" rm -- -rf ./scripts
	roachprod put "$1" ./scripts scripts
	echo "n2-standard-8" > "machinetype.txt"
	roachprod put "$1" "machinetype.txt" "machinetype.txt"
	roachprod run "$1" chmod -- -R +x ./scripts
	roachprod put "$1" ./netperf ./netperf
	roachprod run "$1" chmod -- -R +x ./netperf
	}

	# Load the cockroach binary to roachprod cluster
	function load_cockroach() {
	roachprod run "$1" "rm -f ./cockroach"
	if [ -z "$cockroach_binary" ]
	then
	cockroach_version=$(curl -s -i https://edge-binaries.cockroachdb.com/cockroach/cockroach.linux-gnu-amd64.LATEST \|grep location\|awk -F"/" '{print $NF}')
	echo "WARN: staging a stable cockroach binary from master with hash: 5ac733bb4927020bc1c52da24b2591742fde8e1f"
	roachprod stage "$1" cockroach
	elif [[ $cockroach_binary =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
	echo "INFO: staging release version $cockroach_binary of cockroach binary"
	roachprod stage "$1" release "$cockroach_binary"
	else
	echo "WARN: staging unknown version of cockroach binary from local path: $cockroach_binary"
	roachprod put "$1" "$cockroach_binary" "cockroach"
	fi
	}

	# Start cockroach cluster on nodes [1-NODES-1].
	function start_cockroach() {
	# Build --store flags based on the number of disks.
	# Roachprod adds /mnt/data1/cockroach by itself, so, we'll pick up the other disks
	for s in $(roachprod run "$CLUSTER":1 'ls -1d /mnt/data[2-9]* 2>/dev/null \|\| echo')
	do
	stores="$stores --store $s/cockroach"
	done

	if [[ -z $stores ]]; then
	stores="--store=/mnt/data1/cockroach"
	fi

	if [[ $NODES == 2 ]]; then
	roachprod start "$CLUSTER":1 --args="$stores --cache=0.25 --max-sql-memory=0.4" --num-files-limit=512000
	else
	roachprod start "$CLUSTER":1-$((NODES-1)) --args="$stores --cache=0.25 --max-sql-memory=0.4" --num-files-limit=512000
	fi
	}

	# Execute setup.sh script on the cluster to configure it
	function setup_cluster() {
	roachprod run "$1" sudo ./scripts/gen/setup.sh "$CLOUD"
	roachprod run "$1":1 -- cpufetch -s legacy\|awk -F"@" '{print $NF}'\|tr -d ' '\|awk NF > "$logdir"/"$1"_cpu_info.txt
	roachprod run "$1":1 -- lscpu \|grep "MHz" >> "$logdir"/"$1"_cpu_info.txt
	}

	# executes command on a host using roachprod, under tmux session.
	function run_under_tmux() {
	local name=$1
	local host=$2
	local cmd=$3
	roachprod run $host -- tmux neww -t "$TMUX_SESSION" -n "$name" -d -- "$cmd"
	}

	# Benchmark scripts should execute a single benchmark
	# and download results to the $logdir directory.
	# results_dir returns date suffixed directory under logdir.
	#
	function results_dir() {
	echo "$logdir/$1.$(date +%Y%m%d.%T)-$NAME_EXTRA"
	}

	function copy_result_with_retry() {
	# There is a random roachprod issue that we recently identified:
	# After a test finished successfully in a host, the following "roachprod get"
	# command didn't copy the result files correctly from the host node to the
	# client, so the target result directory ended up with all empty files.
	# This function will copy result files from host to client and check that
	# there is no empty result file in the target directory. It will retry the
	# copy step if the previous copy failed with any empty result files, the test
	# will fail if after retry it still couldn't copy file correctly.
	#
	target_dir=$(results_dir "$2")
	for i in {1..3}
	do
	roachprod get "$1" "./$2" "$target_dir"

	result_files=$(find "$target_dir" -empty -type f -name "*.log")
	if [ -z "$result_files" ]
	then
	echo "Test passed!"
	break
	fi
	echo "Copy file round "$i" failed, found empty result file(s):\n$result"
	sleep 5s
	done

	if [ ! -z "$result_files" ]
	then
	echo "Copy failed with empty result file(s) in "$target_dir", test failed!"
	fi

	if [ "$2" == "tpcc-results" ]
	then
	result_files=$(find "$target_dir" -type f -name "*.txt")
	for result_file in $result_files
	do
	prev_line=$(tail -2 "$result_file")
	if [[ "$prev_line" != efc ]] \|\| [[ $(tail -1 "$result_file" \| awk '{if(int($3) > 87){print "pass"}}') != "pass" ]];
	then
	# Instead of deleting invalid result files, we rename them for auditing
	# and validation purpose.
	mv $result_file "$result_file.bak"
	fi
	done
	fi
	}

	# Run CPU benchmark
	function bench_cpu() {
	run_under_tmux "cpu" "$CLUSTER:1" "./scripts/gen/cpu.sh $cpu_extra_args"
	}

	# Wait for CPU benchmark to finish and retrieve results.
	function fetch_bench_cpu_results() {
	node="$CLUSTER":1
	roachprod run $node ./scripts/gen/cpu.sh -- -w
	copy_result_with_retry $node "coremark-results"
	}

	# Run FIO benchmark
	function bench_io() {
	run_under_tmux "io" "$CLUSTER:1" "./scripts/gen/fio.sh $io_extra_args"
	}

	# Wait for FIO benchmark top finish and retrieve results.
	function fetch_bench_io_results() {
	node="$CLUSTER":1
	roachprod run $node ./scripts/gen/fio.sh -- -w
	copy_result_with_retry $node "fio-results"
	}

	# Run IO Fsync benchmark
	function bench_iofsync() {
	run_under_tmux "iofsync" "$CLUSTER:1" "./scripts/gen/fio_fsync.sh $iofsync_extra_args"
	}

	# Wait for IO FSync benchmark top finish and retrieve results.
	function fetch_bench_iofsync_results() {
	roachprod run "$CLUSTER":1 ./scripts/gen/fio_fsync.sh -- -w
	roachprod get "$CLUSTER":1 ./fio-fsync-results $(results_dir "fio-fsync-results")
	}

	# Wait for Netperf benchmark to complete and fetch results.
	function fetch_bench_net_results() {
	if [ $NODES -lt 2 ]
	then
	echo "NODES must be greater than 1 for this test"
	exit 1
	fi

	target_dir=$(results_dir "netperf-results")
	if [ $NODES -eq 2 ]
	then
	node="$CLUSTER":1
	else
	node="$CLUSTER":3
	fi
	roachprod run $node ./scripts/gen/network-netperf.sh -- -w
	copy_result_with_retry $node "netperf-results"
	}

	# Run TPCC Benchmark
	function bench_tpcc() {
	if [ $NODES -lt 2 ]; then
	echo "NODES must be greater than 1 for this test"
	exit 1
	fi

	if [[ -z $TPCC_WAREHOURSE_PER_VCPU ]]; then
	echo "env var TPCC_WAREHOURSE_PER_VCPU must not be set empty"
	exit 1
	fi

	start_cockroach
	if [ $NODES -eq 2 ]; then
	pgurls=$(roachprod pgurl "$CLUSTER":1)
	run_under_tmux "tpcc" "$CLUSTER:2" "./scripts/gen/tpcc.sh -a $TPCC_WAREHOURSE_PER_VCPU $tpcc_extra_args ${pgurls[@]}"
	else
	pgurls=$(roachprod pgurl "$CLUSTER":1-$((NODES-1)))
	run_under_tmux "tpcc" "$CLUSTER:$NODES" "./scripts/gen/tpcc.sh -a $TPCC_WAREHOURSE_PER_VCPU $tpcc_extra_args ${pgurls[@]}"
	fi
	}

	function fetch_bench_tpcc_results() {
	if [ $NODES -lt 2 ]
	then
	echo "NODES must be greater than 1 for this test"
	exit 1
	fi

	node="$CLUSTER":$NODES

	# Don't exist if the following section gives error.
	set +e
	roachprod run $node ./scripts/gen/tpcc.sh -- -w
	copy_result_with_retry $node "tpcc-results"
	set -e
	}

	# modify_remote_hosts_on_client_node is to get the ip from the remote node,
	# write it into a local file, and mount it to the netperf/doc/examples folder
	# in the client node.
	function modify_remote_hosts_on_client_node() {
	# client_node is the one to run TCP_RR and TCP_STREAM.
	local client_node=$1
	local server_node=$2
	# test_mode should be either cross-region or intra-az.
	local test_mode=$3

	local server_ip=$(roachprod ip "$server_node")
	if [ -z $server_ip ]
	then
	echo "cannot get server_ip FOR server (remote) node ($server_node) in network test"
	exit 1
	fi

	# Since linux doesn't allow ":" in filename, we replace the ":" in
	# $server_node to "-".
	formatted_server_node=$(echo "${server_node//:/-}")
	echo "formatted_server_node=$formatted_server_node"

	# Save the ip address of the server node into the
	# netperf/doc/examples/remote_hosts in the client node.
	local remote_host_file="${logdir}/${formatted_server_node}_${test_mode}_remote_hosts"
	printf "REMOTE_HOSTS[0]=$server_ip\nREMOTE_HOSTS[1]=$server_ip\nNUM_REMOTE_HOSTS=2\n" >"$remote_host_file"
	chmod 777 "$remote_host_file"
	roachprod run "$client_node" -- sudo chmod 777 -R netperf
	roachprod put "$client_node" "$remote_host_file" netperf/doc/examples/${test_mode}_remote_hosts
	}

	# get_best_number_streams is to run a netperf TCP_STREAM test with
	# gradually incrementing the number of streams until the aggregate throughput
	# converges (the latest 3 agg throughput's std < 0.3). The best number of
	# streams will be saved in a file "num_streams" in the client node.
	function get_best_number_streams() {
	local client_node=$1
	local test_mode=$2
	echo "running getting best num of stream for $client_node"

	roachprod run "$client_node" -- "cd netperf/doc/examples && SEARCH_BEST_NUM_STREAMS=1 TEST_MODE=$test_mode ./runemomniaggdemo.sh"
	echo "get best number of stream for $client_node"
	}

	# run_netperf_between_server_client is to get the best number of streams
	# to run the throughput test between the server and client node, and run the
	# netperf latency and throughput test. We start the netserver on the server node
	# and run netperf command on the client node.
	# Note that in the cross-region case, we set the east node as the client node,
	# and the west node as the server node.
	function run_netperf_between_server_client() {

	local client_node=$1
	local server_node=$2

	local PORT=$3
	local test_mode=$4
	local netperf_extra_args=$5

	local server_ip=$(roachprod ip $server_node)

	roachprod run $client_node sudo ./scripts/gen/network-setup.sh
	roachprod run $server_node sudo ./scripts/gen/network-setup.sh

	# Start netserver on the server node.
	# It may give error, but it only means that the netserver is already running
	# on the given port, so we should proceed when that happens.
	set +e
	roachprod run $server_node ./scripts/gen/network-test.sh -- -S -p $PORT -m $server_node
	set -e

	# Mount a file containing server's ip to the client node.
	modify_remote_hosts_on_client_node $client_node $server_node $test_mode
	get_best_number_streams $client_node $test_mode
	run_under_tmux "${test_mode}-net" $client_node "./scripts/gen/network-test.sh -s $server_ip -p $PORT -m $test_mode -z $CLOUD-n2-standard-8 $netperf_extra_args"
	}

	# Run intra-az Netperf benchmark. The test will be run the 1st and the 2nd
	# nodes of the same cluster.
	function bench_intra_az_net() {
	if [ $NODES -lt 2 ]
	then
	echo "NODES must be greater than 1 for this test"
	exit 1
	fi

	local server_node="$CLUSTER":2
	local client_node="$CLUSTER":1

	run_netperf_between_server_client $client_node $server_node $INTER_AZ_PORT intra-az "$net_extra_args"
	}

	# Wait for Netperf benchmark to complete and fetch results.
	function fetch_bench_intra_az_net_results() {
	if [ $NODES -lt 2 ]
	then
	echo "NODES must be greater than 1 for this test"
	exit 1
	fi

	roachprod run ${CLUSTER}:1 ./scripts/gen/network-test.sh -- -w -m intra-az
	roachprod get ${CLUSTER}:1 ./intra-az-netperf-results $(results_dir "intra-az-netperf-results")
	}

	# bench_cross_region_net is run the cross-region network tests.
	function bench_cross_region_net() {
	create_west_cluster
	upload_scripts "$WEST_CLUSTER"
	setup_cluster "$WEST_CLUSTER"

	run_netperf_between_server_client ${CLUSTER}:1 ${WEST_CLUSTER}:1 $CROSS_REGION_PORT cross-region $cross_region_net_extra_args
	}

	# fetch_bench_cross_region_net_results is to wait the cross-region network test
	# to finish and the fetch the results from the server node.
	function fetch_bench_cross_region_net_results() {
	roachprod run ${CLUSTER}:1 ./scripts/gen/network-test.sh -- -w -m cross-region
	roachprod get ${CLUSTER}:1 ./cross-region-netperf-results $(results_dir "cross-region-netperf-results")
	}

	# Destroy roachprod cluster
	function destroy_cluster() {
	roachprod destroy "$CLUSTER"
	if [[ -n $west_cluster_created ]]; then
	roachprod destroy "$WEST_CLUSTER"
	fi
	}

	function usage() {
	echo "$1
	Usage: $0 [-b <bootstrap>]... [-w <workload>]... [-d] [-c cockroach_binary]
	-b: One or more bootstrap steps.
	-b create: creates cluster
	-b upload: uploads required scripts
	-b setup: execute setup script on the cluster
	-b all: all of the above steps
	-w: Specify workloads (benchmarks) to execute.
	-w cpu : Benchmark CPU
	-w io : Benchmark IO
	-w iofsync : Benchmark IO Fsync
	-w ia_net : Benchmark Net. Please don't run "ia_net" and "cr_net" on the same cluster.
	-w cr_net : Benchmark Cross-region Net. Please don't run "ia_net" and "cr_net" on the same cluster.
	-w tpcc: Benchmark TPCC
	-w all : All of the above
	-c: Override cockroach binary to stage (local path to binary or release version)
	-r: Do not start benchmarks specified by -w. Instead, resume waiting for their completion.
	-I: additional IO benchmark arguments
	-F: additional IO Fsync benchmark arguments
	-N: additional network benchmark arguments
	-C: additional CPU benchmark arguments
	-T: additional TPCC benchmark arguments
	-R: additional cross-region network benchmark arguments
	-n: override number of nodes in a cluster
	-d: Destroy cluster
	"
	exit 1
	}

	benchmarks=()
	f_resume=''
	do_create=''
	do_upload=''
	do_setup=''
	do_destroy=''
	io_extra_args=''
	iofsync_extra_args=''
	cpu_extra_args=''
	tpcc_extra_args=' -L "--provider-override=gs --bucket-override=cloud-report-tpcc" '
	intra_az_net_extra_args=''
	cross_region_net_extra_args=''
	cockroach_binary=''

	while getopts 'c:b:w:dn:I:F:N:C:T:R:r' flag; do
	case "${flag}" in
	b) case "${OPTARG}" in
	all)
	do_create='true'
	do_upload='true'
	do_setup='true'
	do_cockroach='true'
	;;
	create) do_create='true' ;;
	upload) do_upload='true' ;;
	setup) do_setup='true' ;;
	*) usage "Invalid -b value '${OPTARG}'" ;;
	esac
	;;
	c) cockroach_binary="${OPTARG}" ;;
	w) case "${OPTARG}" in
	cpu) benchmarks+=("bench_cpu") ;;
	io) benchmarks+=("bench_io") ;;
	iofsync) benchmarks+=("bench_iofsync") ;;
	ia_net) benchmarks+=("bench_intra_az_net") ;;
	cr_net) benchmarks+=("bench_cross_region_net") ;;
	tpcc) benchmarks+=("bench_tpcc") ;;
	all) benchmarks+=("bench_cpu" "bench_io" "bench_io_fsync" "bench_tpcc" "bench_cross_region_net") ;;
	*) usage "Invalid -w value '${OPTARG}'";;
	esac
	;;
	d) do_destroy='true' ;;
	r) f_resume='true' ;;
	n) NODES="${OPTARG}" ;;
	I) io_extra_args="${OPTARG}" ;;
	F) iofsync_extra_args="${OPTARG}" ;;
	C) cpu_extra_args="${OPTARG}" ;;
	T) tpcc_extra_args="${OPTARG}" ;;
	N) intra_az_net_extra_args="${OPTARG}" ;;
	R) cross_region_net_extra_args="${OPTARG}" ;;
	*) usage ;;
	esac
	done

	if [ -n "$do_create" ];
	then
	create_cluster
	fi

	if [ -n "$do_upload" ];
	then
	upload_scripts $CLUSTER
	load_cockroach $CLUSTER
	fi

	if [ -n "$do_setup" ];
	then
	setup_cluster $CLUSTER
	fi

	if [ -z "$f_resume" ]
	then
	# Execute requested benchmarks.
	for bench in "${benchmarks[@]}"
	do
	$bench
	done
	fi

	# Wait for benchmarks to finsh and fetch their results.
	for bench in "${benchmarks[@]}"
	do
	echo "Waiting for $bench to complete"
	fetch="fetch_${bench}_results"
	$fetch
	done

	if [ -n "$do_destroy" ];
	then
	destroy_cluster
	fi
	#!/bin/bash

	set -ex
	pidfile="$HOME/tpcc-bench.pid"
	f_force=''
	f_wait=''
	f_active=0
	f_warehouses=10000
	f_active_per_core=125
	f_skip_load=''
	f_load_args=''
	f_duration="30m"

	function usage() {
	echo "$1
	Usage: $0 [-f] [-w] [-s server] [pgurl,...]
	-f: ignore existing pid file; override and rerun.
	-w: wait for currently running benchmark to complete.
	-W: number of warehouses; default 2500
	-A: number of starting active warehouses
	-s: skip loading stage
	-L: extra args for load
	-a: number of active warehouses per core
	-d: duration; default 30m
	"
	exit 1
	}

	while getopts 'fwsW:A:d:a:L:' flag; do
	case "${flag}" in
	f) f_force='true' ;;
	w) f_wait='true' ;;
	W) f_warehouses="${OPTARG}" ;;
	A) f_active="${OPTARG}" ;;
	a) f_active_per_core="${OPTARG}" ;;
	s) f_skip_load='true' ;;
	d) f_duration="${OPTARG}" ;;
	L) f_load_args="${OPTARG}" ;;
	*) usage "";;
	esac
	done

	logdir="$HOME/tpcc-results"

	if [ -n "$f_wait" ];
	then
	exec sh -c "
	( test -f '$logdir/success' \|\|
	(test -f \"$pidfile\" && tail --pid \"$(cat $pidfile)\" -f /dev/null && test -f '$logdir/success')
	) \|\| (echo 'TPC-C benchmark did not complete successfully. Check logs'; exit 1)"
	fi

	echo "f_load_args:[$f_load_args]"
	echo "f_active_per_core:[$f_active_per_core]"


	if [ -f "$pidfile" ] && [ -z "$f_force" ];
	then
	pid=$(cat "$pidfile")
	echo "TPCC benchmark already running (pid $pid)"
	exit
	fi

	shift $((OPTIND - 1 ))
	pgurls=("$@")

	if [[ ${#pgurls[@]} == 0 ]];
	then
	usage "list of pgurls required"
	fi

	trap "rm -f $pidfile" EXIT SIGINT
	echo $$ > "$pidfile"

	rm -rf "$logdir"
	mkdir "$logdir"
	exec &> >(tee -a "$logdir/script.log")

	cd "$HOME"

	if [ -z "$f_skip_load" ]
	then
	#./cockroach sql --insecure --url "${pgurls[0]}" -e "
	# SET CLUSTER SETTING kv.snapshot_recovery.max_rate = '512 MiB';
	# SET CLUSTER SETTING kv.snapshot_rebalance.max_rate = '512 MiB';
	# SET CLUSTER SETTING admission.kv.enabled,=false;
	# SET CLUSTER SETTING admission.sql_kv_response.enabled=false;
	# SET CLUSTER SETTING admission.sql_sql_response.enabled=false;
	#";
	echo "Loading TPCC fixture for $f_warehouses warehouses ..."
	./cockroach workload fixtures load tpcc --checks=false --warehouses="$f_warehouses" $f_load_args "${pgurls[0]}"
	echo "done loading"
	fi

	if (( f_active == 0 ))
	then
	# Scale active warehouse count by f_active_per_core * number of CPUs.
	f_active=$(( f_active_per_core * $(cat /proc/cpuinfo \| grep processor \| wc -l) ))
	if (( f_active > f_warehouses ))
	then
	f_active=0
	fi
	fi

	report="${logdir}/tpcc-results-$f_active.txt"
	./cockroach workload run tpcc \
	--warehouses="$f_warehouses" --active-warehouses="$f_active" --ramp=5m --duration="$f_duration" --tolerate-errors --wait=0 \
	"${pgurls[@]}" > "$report"