Last active
March 25, 2020 05:06
-
-
Save gbraccialli/a7d7d1288563085f4a8e6e904c9609b3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Download and unpack Spark 2.4.3 ("without hadoop" build) and Hadoop 3.1.1
# into ~/spark. -e aborts on any failed download/extract; mkdir -p is
# idempotent so the script can be re-run safely.
set -e
mkdir -p ~/spark
cd ~/spark
wget https://archive.apache.org/dist/spark/spark-2.4.3/spark-2.4.3-bin-without-hadoop.tgz
wget https://archive.apache.org/dist/hadoop/core/hadoop-3.1.1/hadoop-3.1.1.tar.gz
tar xf hadoop-3.1.1.tar.gz
tar xf spark-2.4.3-bin-without-hadoop.tgz
cd ~
##################
def init_spark(spark_hadoop_path):
    """Point a "without-hadoop" Spark build at a local Hadoop install and
    return a local SparkSession.

    Sets ``SPARK_DIST_CLASSPATH`` to the Hadoop 3.1.1 jar locations (the
    layout ``hadoop classpath`` would emit), initialises findspark against
    the Spark 2.4.3 distribution, then builds a ``local[*]`` session.

    Parameters
    ----------
    spark_hadoop_path : str
        Directory containing the unpacked ``hadoop-3.1.1`` and
        ``spark-2.4.3-bin-without-hadoop`` trees.

    Returns
    -------
    pyspark.sql.SparkSession
    """
    import os

    hadoop_home = os.path.join(spark_hadoop_path, "hadoop-3.1.1")
    # Classpath entries relative to the Hadoop root; kept as data so the
    # common prefix is written once instead of twelve times.
    relative_entries = [
        ("etc", "hadoop"),
        ("share", "hadoop", "tools", "lib", "*"),
        ("share", "hadoop", "common", "lib", "*"),
        ("share", "hadoop", "common", "*"),
        ("share", "hadoop", "hdfs"),
        ("share", "hadoop", "hdfs", "lib", "*"),
        ("share", "hadoop", "hdfs", "*"),
        ("share", "hadoop", "mapreduce", "lib", "*"),
        ("share", "hadoop", "mapreduce", "*"),
        ("share", "hadoop", "yarn"),
        ("share", "hadoop", "yarn", "lib", "*"),
        ("share", "hadoop", "yarn", "*"),
    ]
    os.environ["SPARK_DIST_CLASSPATH"] = os.pathsep.join(
        os.path.join(hadoop_home, *parts) for parts in relative_entries
    )

    # Must run before pyspark is imported so SPARK_HOME/sys.path are set up.
    import findspark
    findspark.init(
        os.path.join(spark_hadoop_path, "spark-2.4.3-bin-without-hadoop")
    )

    from pyspark.sql import SparkSession
    spark = (
        SparkSession.builder
        .master("local[*]")
        .appName("xxx")
        .config("spark.driver.memory", "8g")
        .config("spark.driver.maxResultSize", "4g")
        .config("spark.sql.execution.arrow.enabled", "true")
        .getOrCreate()
    )
    return spark
# NOTE(review): the setup script earlier in this file unpacks Hadoop/Spark
# into ~/spark, but this call points at /opt/spark — confirm which location
# the distributions actually live in before running.
spark = init_spark("/opt/spark")

from pyspark.sql import functions as F
from pyspark.sql.window import Window
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment