Last active
June 21, 2023 05:30
-
-
Save owainlewis/1e7d1e68a6818ee4d50e to your computer and use it in GitHub Desktop.
Gzip Scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.{ByteArrayOutputStream, ByteArrayInputStream} | |
import java.util.zip.{GZIPOutputStream, GZIPInputStream} | |
import scala.util.Try | |
/** Gzip helpers that work on in-memory byte arrays. */
object Gzip {

  /**
   * Compresses `input` into a complete gzip stream.
   *
   * @param input raw bytes to compress; may be empty
   * @return the gzip-encoded bytes
   */
  def compress(input: Array[Byte]): Array[Byte] = {
    val bos = new ByteArrayOutputStream(input.length)
    val gzip = new GZIPOutputStream(bos)
    // close() finishes the gzip stream and also closes `bos`; it must run even
    // when write() throws so the compressor is always released.
    try gzip.write(input)
    finally gzip.close()
    bos.toByteArray
  }

  /**
   * Decompresses gzip data and decodes the result as UTF-8 text.
   *
   * The text is decoded as UTF-8 explicitly instead of with the platform
   * default codec, so the result does not depend on the JVM's locale settings.
   *
   * @param compressed gzip-encoded bytes
   * @return `Some(text)` on success, `None` if any non-fatal error occurs
   *         while reading or decoding
   */
  def decompress(compressed: Array[Byte]): Option[String] =
    Try {
      val inputStream = new GZIPInputStream(new ByteArrayInputStream(compressed))
      // Close the stream on both the success and the failure path.
      try scala.io.Source.fromInputStream(inputStream)(scala.io.Codec.UTF8).mkString
      finally inputStream.close()
    }.toOption
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** Round-trip spec: bytes produced by `Gzip.compress` are turned back into the original text by `Gzip.decompress`. */
class GzipSpec extends WordSpecLike with Matchers {

  "The GZIP object" should {

    "decompress a compressed string" in {
      val gzipped = Gzip.compress("Hello World".getBytes("UTF-8"))
      val roundTripped = Gzip.decompress(gzipped)

      roundTripped shouldBe Some("Hello World")
    }
  }
}
Is there any dependency we need to include to run this code? I am getting the following error message: "Cannot resolve symbol wordspeclike".
Thanks @owainlewis!
This is very nice and useful.
Just wondering - why does decompress return a String and not an Array[Byte]?
I think having Array[Byte] will make this useful for a variety of binary and non binary items.
I have the same question as JThakrar.
Here's my version of decompress.
/**
 * Decompresses gzip-encoded bytes back into the raw bytes.
 *
 * @param compressed gzip-encoded bytes
 * @return the decompressed bytes (empty if the payload was empty)
 * @throws java.io.IOException if `compressed` cannot be read as gzip data
 */
def decompress(compressed: Array[Byte]): Array[Byte] = {
  val gzipInputStream = new GZIPInputStream(new ByteArrayInputStream(compressed))
  try {
    val output = new ByteArrayOutputStream()
    val bytes = new Array[Byte](1024)
    // read() returns -1 at end of stream; loop on that rather than on
    // available(), which is only an estimate and not an end-of-data test.
    Iterator
      .continually(gzipInputStream.read(bytes))
      .takeWhile(_ != -1)
      .foreach(byteCount => output.write(bytes, 0, byteCount))
    output.toByteArray
  } finally gzipInputStream.close() // always release the decompressor
}
Similar to @JThakrar's; probably simpler, but it requires Apache Commons IO:
/**
 * Decompresses gzip-encoded bytes using Apache Commons IO to drain the stream.
 *
 * @param compressed gzip-encoded bytes
 * @return `Some(bytes)` on success, `None` if any non-fatal error occurs
 *         while opening or reading the stream
 */
def decompress(compressed: Array[Byte]): Option[Array[Byte]] =
  Try {
    val inputStream = new GZIPInputStream(new ByteArrayInputStream(compressed))
    // Close the stream here on both the success and the failure path.
    try org.apache.commons.io.IOUtils.toByteArray(inputStream)
    finally inputStream.close()
  }.toOption
Thanks. Here is an extra example that sequentially generates gzipped CSV files with os-lib:
Add the following to build.sbt
libraryDependencies += "com.lihaoyi" %% "os-lib" % "0.8.0"
then
package example
import scala.util.{Try, Success, Failure, Random}
import java.io.{ByteArrayOutputStream, ByteArrayInputStream}
import java.util.zip.{GZIPOutputStream, GZIPInputStream}
/**
 * Demo entry point: writes a gzip-compressed CSV header to /tmp/file.gz, then
 * appends separately compressed chunks of random rows to the same file.
 */
object Main extends App {
  // Header row, compressed on its own; this write replaces any existing file.
  val header = List("column1","column2", "column3")
  val header_compressed = Gzip.compress(
    (header.mkString(",")+"\n").getBytes("UTF-8"))
  os.write.over(os.Path("/tmp/file.gz"), header_compressed)

  // 11 chunks (0 to 10 inclusive), each compressed independently and appended.
  for (_ <- 0 to 10) {
    val rng = Random
    // 101 rows per chunk (0 to 100 inclusive), three random ints in [0, 100) per row.
    val rows = Vector.fill(101) {
      Vector.fill(3)(rng.nextInt(100).toString).mkString(",")
    }
    val chunk = Gzip.compress((rows.mkString("\n")+"\n").getBytes("UTF-8"))
    os.write.append(os.Path("/tmp/file.gz"), chunk)
  }
}
/** Gzip helpers that work on in-memory byte arrays. */
object Gzip {

  /**
   * Compresses `input` into a complete gzip stream.
   *
   * @param input raw bytes to compress; may be empty
   * @return the gzip-encoded bytes
   */
  def compress(input: Array[Byte]): Array[Byte] = {
    val bos = new ByteArrayOutputStream(input.length)
    val gzip = new GZIPOutputStream(bos)
    // close() finishes the gzip stream and also closes `bos`; it must run even
    // when write() throws so the compressor is always released.
    try gzip.write(input)
    finally gzip.close()
    bos.toByteArray
  }

  /**
   * Decompresses gzip data and decodes the result as UTF-8 text.
   *
   * The text is decoded as UTF-8 explicitly instead of with the platform
   * default codec, so the result does not depend on the JVM's locale settings.
   *
   * @param compressed gzip-encoded bytes
   * @return `Some(text)` on success, `None` if any non-fatal error occurs
   *         while reading or decoding
   */
  def decompress(compressed: Array[Byte]): Option[String] =
    Try {
      val inputStream = new GZIPInputStream(new ByteArrayInputStream(compressed))
      // Close the stream on both the success and the failure path.
      try scala.io.Source.fromInputStream(inputStream)(scala.io.Codec.UTF8).mkString
      finally inputStream.close()
    }.toOption
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@sbhola only if you need it in an ASCII string for sending via a transport like SMTP or embedding within some other file format. The raw binary data will be smaller than the Base64 representation.