Created
May 21, 2018 16:46
-
-
Save jose-goncabel/9b8224ccfa2ca9ac8346577c1ffe2771 to your computer and use it in GitHub Desktop.
Keras + Tensorflow + Spark - A PySpark script showing how to use multiple GPUs for prediction within a Spark environment by loading a pre-trained Keras model on each worker.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#####
# IMPORTS
#####
from pyspark import TaskContext
import os

#####
# PATHS
#####
# Pre-trained Keras model (HDF5) that every worker loads locally.
path_model = "/path/to/pretrained/model.h5"
# Text input that will be distributed across the GPU workers.
path_rdd = "/path/to/rdd"

#####
# STATIC VARIABLES
#####
# Number of GPUs available per worker node; the RDD is repartitioned to
# this count so each partition is pinned to exactly one GPU.
gpus_available = 4

#####
# RDD Creation - input_rdd
#####
# NOTE(review): `sc` (SparkContext) is assumed to already exist — e.g.
# this script runs inside a pyspark shell or a notebook with an active
# Spark session. Confirm before running standalone with spark-submit.
input_rdd = sc.textFile(path_rdd)

#####
# Predict for partition function
#####
def predict_for_partition(partition):
    """Load the Keras model on this worker and yield one prediction per row.

    Each Spark partition is pinned to a single GPU by setting
    CUDA_VISIBLE_DEVICES to the partition id *before* the model is loaded,
    so partitions 0..gpus_available-1 end up on distinct GPUs.

    Parameters:
        partition: iterator over the rows of one RDD partition.

    Yields:
        The model's prediction for each row, in row order.
    """
    # Import here so the dependency resolves on the worker process that
    # executes this function, not just on the driver. The original code
    # used load_model without importing it anywhere (NameError at runtime).
    from keras.models import load_model

    # The partition id decides which GPU this task is allowed to see.
    ctx = TaskContext.get()
    current_partition = ctx.partitionId()

    # Restrict CUDA to a single device. This must happen before the
    # Keras/TensorFlow backend creates its CUDA context (i.e. before
    # load_model below), otherwise all GPUs would be grabbed.
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = str(current_partition)

    # Load the pre-trained model once per partition, then stream the rows.
    model = load_model(path_model)
    for row in partition:
        # NOTE(review): rows come from sc.textFile, i.e. raw strings;
        # model.predict presumably expects a numeric array — confirm that
        # parsing/vectorization happens upstream or add it here.
        prediction = model.predict(row)
        yield prediction
# Repartition to one partition per GPU, then lazily map the prediction
# function over each partition. Both transformations are deferred until
# an action (e.g. collect/saveAsTextFile) triggers execution.
input_rdd = (
    input_rdd
    .repartition(gpus_available)
    .mapPartitions(predict_for_partition)
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment