#-----------------------------------------------------------------------------------------------
#copy emr conf from an existing cluster node
#-----------------------------------------------------------------------------------------------
emr_ip=10.135.241.137
sudo rm -rf /etc/yum.repos.d/emr-*.repo
sudo rm -rf /var/aws/emr/repoPublicKey.txt
sudo mkdir -p /var/aws/emr/
sudo chmod +r -R /var/aws/
sudo rm -rf /etc/spark/
sudo rm -rf /etc/hadoop/
sudo scp -i xxx.pem hadoop@${emr_ip}:/etc/yum.repos.d/emr-*.repo /etc/yum.repos.d/
sudo scp -i xxx.pem hadoop@${emr_ip}:/var/aws/emr/repoPublicKey.txt /var/aws/emr/
sudo scp -rp -i xxx.pem hadoop@${emr_ip}:/etc/spark/ /etc/spark/
sudo scp -rp -i xxx.pem hadoop@${emr_ip}:/etc/hadoop/ /etc/hadoop/
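#quick sanity check that the emr yum repos and configs landed; the repo-id grep is an
#assumption, adjust to whatever `yum repolist` actually prints for your emr release
yum repolist 2>/dev/null | grep -i emr
ls /etc/hadoop/conf /etc/spark/conf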
#-----------------------------------------------------------------------------------------------
#install emr binaries
#-----------------------------------------------------------------------------------------------
sudo mkdir -p /var/aws/emr/
sudo mkdir -p /etc/hadoop/conf
sudo mkdir -p /etc/spark/conf
sudo mkdir -p /var/log/spark/user/
sudo mkdir -p /mnt/s3
sudo mkdir -p /mnt/var/lib/hadoop/tmp
sudo chmod 777 -R /var/log/spark/
sudo chmod 777 -R /mnt/s3
sudo chmod 777 -R /mnt/var/lib/hadoop/tmp
sudo yum install -y hadoop-client
sudo yum install -y hadoop-hdfs
sudo yum install -y hadoop-lzo
sudo yum install -y spark-core
sudo yum install -y spark-python
sudo yum install -y aws-java-sdk
sudo yum install -y aws-sagemaker-spark-sdk
sudo yum install -y emr-ddb
sudo yum install -y emr-ddb-hadoop
sudo yum install -y emr-ddb-hive
sudo yum install -y emr-goodies
sudo yum install -y emr-goodies-hadoop
sudo yum install -y emr-goodies-hive
sudo yum install -y emr-goodies-parquet
sudo yum install -y emr-goodies-spark
sudo yum install -y emr-s3-select
sudo yum install -y emrfs
sudo yum install -y livy
sudo yum install -y spark-datanucleus
sudo yum install -y spark-external
sudo yum install -y spark-history-server
sudo yum install -y git
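#verify the binaries resolve before moving on (both commands ship with the packages above)
hadoop version
spark-submit --version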
#-----------------------------------------------------------------------------------------------
#install anaconda and create conda env
#-----------------------------------------------------------------------------------------------
wget https://repo.anaconda.com/archive/Anaconda3-2019.10-Linux-x86_64.sh
chmod +x Anaconda3-2019.10-Linux-x86_64.sh
sudo ./Anaconda3-2019.10-Linux-x86_64.sh -b -p /anaconda3
rm -rf Anaconda3-2019.10-Linux-x86_64.sh
sudo /anaconda3/bin/conda install -y -c conda-forge jupyterhub
sudo /anaconda3/bin/pip install jupyter
sudo /anaconda3/bin/pip install jupyterlab
sudo /anaconda3/bin/pip install jupyterhub
sudo /anaconda3/bin/pip install findspark
sudo /anaconda3/bin/pip install pandas
sudo /anaconda3/bin/pip install pyarrow
sudo /anaconda3/bin/pip install pandas-profiling
sudo /anaconda3/bin/pip install s3contents
sudo /anaconda3/bin/pip install hybridcontents
sudo /anaconda3/bin/pip install jupyter-server-proxy
sudo /anaconda3/bin/pip install ipykernel
sudo /anaconda3/bin/jupyter serverextension enable --sys-prefix jupyter_server_proxy
sudo /anaconda3/bin/conda init
sudo /anaconda3/bin/conda create -y -n xxxx python=3.6
sudo /anaconda3/bin/conda create -y -n yyyy python=3.6
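#minimal sketch of how a python env attaches to the spark install above via findspark;
#/usr/lib/spark is an assumption based on the default emr layout
/anaconda3/bin/python - <<'EOF'
import findspark
findspark.init("/usr/lib/spark")  # puts pyspark from SPARK_HOME on sys.path
import pyspark
print(pyspark.__version__)
EOF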
#-----------------------------------------------------------------------------------------------
#start jupyter hub and create kernels
#-----------------------------------------------------------------------------------------------
sudo su -
conda activate xxxx
pip install ipykernel
python -m ipykernel install --name xxxx --display-name "xxxx"
conda deactivate
conda activate yyyy
pip install ipykernel
python -m ipykernel install --name yyyy --display-name "yyyy"
sudo chmod 777 -R /anaconda3
sudo adduser jupyter
sudo passwd jupyter
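#launch the hub itself; binding to all interfaces on the default port 8000 is an assumption,
#adjust ip/port (and put tls in front) for anything beyond a quick test
sudo /anaconda3/bin/jupyterhub --ip 0.0.0.0 --port 8000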
#-----------------------------------------------------------------------------------------------
#configure jupyter notebook-s3 storage
#-----------------------------------------------------------------------------------------------
sudo su - jupyter
#create file: /home/jupyter/.jupyter/jupyter_notebook_config.py
from s3contents import S3ContentsManager
from hybridcontents import HybridContentsManager
#from IPython.html.services.contents.filemanager import FileContentsManager
from notebook.services.contents.filemanager import FileContentsManager
c = get_config()
c.NotebookApp.contents_manager_class = HybridContentsManager
c.HybridContentsManager.manager_classes = {
    # Associate the root directory with an S3ContentsManager.
    # This manager will receive all requests that don't fall under any of the
    # other managers.
    "": S3ContentsManager,
    # Associate /xx_local_directory with a FileContentsManager.
    "xx_local_directory": FileContentsManager,
}
c.HybridContentsManager.manager_kwargs = {
    # Args for the root S3ContentsManager.
    "": {
        "bucket": "xxxxx",
        "prefix": "jupyter_notebooks",
        "sse": "AES256",
        "signature_version": "s3v4",
    },
    # Args for the FileContentsManager mapped to /xx_local_directory.
    "xx_local_directory": {
        "root_dir": "/home/jupyter",
    },
}
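#sanity check: start a single-user server as jupyter and confirm the s3 contents manager loads
#(credentials or an instance profile with access to the bucket are assumed to be in place)
/anaconda3/bin/jupyter notebook --no-browser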
#-----------------------------------------------------------------------------------------------
#setup gpu and add kernel
#-----------------------------------------------------------------------------------------------
sudo wget http://us.download.nvidia.com/tesla/440.64.00/NVIDIA-Linux-x86_64-440.64.00.run -O /root/nvidia_install.run
sudo yum groupinstall -y "Development Tools"
sudo yum install -y kernel-tools kernel-tools-devel kernel kernel-devel
sudo yum install -y kernel-devel-$(uname -r)
sudo chmod +x /root/nvidia_install.run
sudo /root/nvidia_install.run
sudo /anaconda3/bin/conda create -y -n gpu python=3.7
sudo su -
conda activate gpu
conda install -c conda-forge pytorch cudatoolkit=10.1
pip install ipykernel
python -m ipykernel install --name gpu --display-name "gpu"
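#verify the driver loaded and that pytorch can see the gpu; /anaconda3/envs/gpu is the
#default env path for the conda create above
nvidia-smi
/anaconda3/envs/gpu/bin/python - <<'EOF'
import torch
print("cuda available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
EOF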
#-----------------------------------------------------------------------------------------------
#/etc/hadoop/conf/yarn-site.xml (add these properties inside the existing <configuration> element)
#-----------------------------------------------------------------------------------------------
<property>
  <name>yarn.resourcemanager.scheduler.class</name>
  <value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler</value>
</property>
<property>
  <name>yarn.scheduler.fair.user-as-default-queue</name>
  <value>false</value>
</property>
<property>
  <name>yarn.scheduler.fair.preemption</name>
  <value>true</value>
</property>
<property>
  <name>yarn.scheduler.fair.preemption.cluster-utilization-threshold</name>
  <value>0.3f</value>
</property>
<property>
  <name>yarn.scheduler.fair.waitTimeBeforeNextStarvationCheck</name>
  <value>3000</value>
</property>
<property>
  <name>yarn.scheduler.fair.waitTimeBeforeKill</name>
  <value>3000</value>
</property>
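#restart the resourcemanager so the scheduler change takes effect; the exact service manager
#depends on the emr release (systemd vs upstart), so try both
sudo systemctl restart hadoop-yarn-resourcemanager 2>/dev/null \
  || { sudo stop hadoop-yarn-resourcemanager; sudo start hadoop-yarn-resourcemanager; }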
#-----------------------------------------------------------------------------------------------
#/etc/hadoop/conf/fair-scheduler.xml
#-----------------------------------------------------------------------------------------------
<?xml version="1.0"?>
<allocations>
  <queue name="root">
    <minSharePreemptionTimeout>3</minSharePreemptionTimeout>
    <fairSharePreemptionTimeout>3</fairSharePreemptionTimeout>
    <allowPreemptionFrom>true</allowPreemptionFrom>
    <minResources>100000 mb, 0 vcores</minResources>
    <schedulingPolicy>fair</schedulingPolicy>
  </queue>
  <defaultFairSharePreemptionTimeout>3</defaultFairSharePreemptionTimeout>
  <defaultMinSharePreemptionTimeout>3</defaultMinSharePreemptionTimeout>
  <defaultFairSharePreemptionThreshold>0.8f</defaultFairSharePreemptionThreshold>
</allocations>
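#the fairscheduler reads fair-scheduler.xml from the hadoop conf dir by default; a quick
#well-formedness check before restarting (xmllint ships with libxml2, may need installing)
xmllint --noout /etc/hadoop/conf/fair-scheduler.xml && echo "fair-scheduler.xml ok"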
#-----------------------------------------------------------------------------------------------
#/etc/spark/conf/spark-defaults.conf (worker instance type = r5.2xlarge (64 GB / 8 cores))
#-----------------------------------------------------------------------------------------------
spark.executor.memory                              12000M
spark.executor.cores                               2
spark.yarn.executor.memoryOverheadFactor           0.1875
spark.driver.memory                                8g
spark.driver.maxResultSize                         16g
spark.driver.memoryOverhead                        4g
spark.dynamicAllocation.executorIdleTimeout        30s
spark.dynamicAllocation.cachedExecutorIdleTimeout  30s
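#rough sizing arithmetic behind these numbers (the ~57 GB of memory yarn typically gets on an
#r5.2xlarge node is an assumption based on common emr defaults):
#  container size = 12000M * (1 + 0.1875) = 14250M
#  4 executors/node * 14250M = 57000M memory, 4 * 2 cores = 8 cores -> fully packs the node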
#-----------------------------------------------------------------------------------------------
#git setup
#-----------------------------------------------------------------------------------------------
git config --global credential.helper store
git clone https://github.com/xxxx
#when the clone prompts for credentials, enter the account email and password/token:
[email protected]
xxxxxxxx
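#note: the store helper keeps credentials in plain text in ~/.git-credentials
#(one https://user:token@host line per remote); git normally sets 600 itself, but make sure
chmod 600 ~/.git-credentials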