# Download a "Hadoop free" Spark 2.4.3 build and a matching Hadoop 3.1.1, then unpack both.
mkdir -p ~/spark
cd ~/spark
wget https://archive.apache.org/dist/spark/spark-2.4.3/spark-2.4.3-bin-without-hadoop.tgz
wget https://archive.apache.org/dist/hadoop/core/hadoop-3.1.1/hadoop-3.1.1.tar.gz
tar xvf hadoop-3.1.1.tar.gz
tar xvf spark-2.4.3-bin-without-hadoop.tgz
cd ~
##################
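The helper below wires the two installs together: the "without-hadoop" Spark build ships no Hadoop jars, so SPARK_DIST_CLASSPATH must list Hadoop's config directory and jar directories before the SparkSession is created. (On a machine where the hadoop launcher is already on the PATH, Spark's "Hadoop Free Build" docs offer export SPARK_DIST_CLASSPATH=$(hadoop classpath) as a shorter alternative.)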
def init_spark(spark_hadoop_path):
    import os

    hadoop_home = os.path.join(spark_hadoop_path, "hadoop-3.1.1")
    # The "without-hadoop" Spark build ships no Hadoop jars, so point
    # SPARK_DIST_CLASSPATH at Hadoop's config directory and jar directories.
    # fmt: off
    os.environ["SPARK_DIST_CLASSPATH"] = os.pathsep.join([
        os.path.join(hadoop_home, "etc", "hadoop"),
        os.path.join(hadoop_home, "share", "hadoop", "tools", "lib", "*"),
        os.path.join(hadoop_home, "share", "hadoop", "common", "lib", "*"),
        os.path.join(hadoop_home, "share", "hadoop", "common", "*"),
        os.path.join(hadoop_home, "share", "hadoop", "hdfs"),
        os.path.join(hadoop_home, "share", "hadoop", "hdfs", "lib", "*"),
        os.path.join(hadoop_home, "share", "hadoop", "hdfs", "*"),
        os.path.join(hadoop_home, "share", "hadoop", "mapreduce", "lib", "*"),
        os.path.join(hadoop_home, "share", "hadoop", "mapreduce", "*"),
        os.path.join(hadoop_home, "share", "hadoop", "yarn"),
        os.path.join(hadoop_home, "share", "hadoop", "yarn", "lib", "*"),
        os.path.join(hadoop_home, "share", "hadoop", "yarn", "*"),
    ])
    # fmt: on

    # findspark.init must run before pyspark is imported, so keep these
    # imports inside the function.
    import findspark
    findspark.init(os.path.join(spark_hadoop_path, "spark-2.4.3-bin-without-hadoop"))

    from pyspark.sql import SparkSession
    spark = (
        SparkSession.builder
        .master("local[*]")
        .appName("xxx")
        .config("spark.driver.memory", "8g")
        .config("spark.driver.maxResultSize", "4g")
        .config("spark.sql.execution.arrow.enabled", "true")
        .getOrCreate()
    )
    return spark
# Point at the directory holding hadoop-3.1.1/ and spark-2.4.3-bin-without-hadoop/
# (the shell steps above extract into ~/spark; adjust the path accordingly).
spark = init_spark("/opt/spark")

from pyspark.sql import functions as F
from pyspark.sql.window import Window
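A quick smoke test, assuming pandas and pyarrow are installed; the data and the "key"/"value" column names are made up for illustration. It confirms the session works, shows F and Window in use, and exercises the Arrow setting enabled above via toPandas():

# Toy DataFrame with hypothetical columns.
df = spark.createDataFrame([("a", 1), ("a", 2), ("b", 3)], ["key", "value"])

# Rank rows within each key using a window function.
w = Window.partitionBy("key").orderBy("value")
df.withColumn("rank", F.row_number().over(w)).show()

# Round-trip to pandas; this path uses Arrow because
# spark.sql.execution.arrow.enabled was set to "true" above.
print(df.toPandas())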