Skip to content

Instantly share code, notes, and snippets.

@mojaveazure
Created December 20, 2024 06:07
Show Gist options
  • Save mojaveazure/ce7b10447b85de1f360218b173f4516f to your computer and use it in GitHub Desktop.
Save mojaveazure/ce7b10447b85de1f360218b173f4516f to your computer and use it in GitHub Desktop.
Test TileDB-SOMA with Python `str`, `bytes`, R `character`, and `raw` data types
---
output:
  slidy_presentation: default
  pdf_document: default
---
```{r setup, include=FALSE}
# Document-wide chunk defaults: echo tidied source, hide messages and warnings
chunk_defaults <- list(
  echo = TRUE,
  tidy = TRUE,
  message = FALSE,
  warning = FALSE
)
do.call(knitr::opts_chunk$set, chunk_defaults)
```
```{r r-install, eval=FALSE}
# Install the latest development build of tiledbsoma-r from R-universe; the
# repositories already set in `getOption("repos")` are searched as well
soma_repos <- c("https://mojaveazure.r-universe.dev", getOption("repos"))
install.packages(pkgs = "tiledbsoma", repos = soma_repos)
```
```{r r-packages}
# Attach the R packages used by the R chunks
library(tiledbsoma)
library(reticulate)

# Bind reticulate to the "tiledbsoma" virtualenv and stop early when the
# Python tiledbsoma module is not available in it
reticulate::use_virtualenv(virtualenv = "tiledbsoma", required = TRUE)
has_soma_py <- reticulate::py_module_available(module = "tiledbsoma")
stopifnot("Cannot find tiledbsoma-py" = has_soma_py)

# Metadata keys inspected from both the R and the Python chunks
names <- c("str", "bytes")
```
```{python py-modules}
# Import Python modules
import numpy
import pyarrow
import tiledbsoma
```
## Group-Level Metadata: Python Write
```{r py-group-uri-init}
# Temporary path shared with the Python chunks through `r.uri`
uri <- tempfile(pattern = "py-group-meta")
```
```{python py-init-group}
# Create a collection at the URI chosen on the R side and store one `str` and
# one `bytes` metadata value. The context manager closes the handle on exit,
# including when one of the metadata writes raises.
with tiledbsoma.Collection.create(r.uri) as grp:
    grp.metadata["str"] = "test_string"
    grp.metadata["bytes"] = b"test_bytes"
```
```{python py-read-group-py}
# Reopen the collection and map each metadata key to the Python type of the
# value stored under it
coll = tiledbsoma.Collection.open(r.uri)
{key: type(coll.metadata[key]) for key in r.names}
coll.close()
```
```{r r-read-group-py}
# Open the collection at `uri` from R and report, per metadata key, the class
# of the `name` element of the stored entry
# NOTE(review): the array-level chunks inspect `class(md[[x]])` directly;
# confirm that reading `$name` here is intentional for collections
collection <- SOMACollectionOpen(uri)
meta <- collection$get_metadata()
sapply(names, function(key) class(meta[[key]]$name))
collection$close()
```
## Group-Level Metadata: R Write
```{r r-init-group, error=TRUE}
# Create a collection from R and try to store a character and a raw metadata
# value; the chunk is knitted with `error=TRUE` so the failure is shown
uri <- tempfile(pattern = "r-group-meta")
grp <- SOMACollectionCreate(uri)
str_meta <- list(str = "test_character")
raw_meta <- list(bytes = charToRaw("test_raw"))
grp$set_metadata(str_meta)
grp$set_metadata(raw_meta) # R API errors out
grp$close()
```
```{r r-read-group-r}
# Reopen the collection at `uri` and report, per metadata key, the class of
# the `name` element of the stored entry
# NOTE(review): the array-level chunks inspect `class(md[[x]])` directly;
# confirm that reading `$name` here is intentional for collections
collection <- SOMACollectionOpen(uri)
meta <- collection$get_metadata()
sapply(names, function(key) class(meta[[key]]$name))
collection$close()
```
```{python py-read-group-r}
# Reopen the collection from Python; `.get()` is used instead of indexing,
# presumably so a key that was never stored shows up as NoneType
coll = tiledbsoma.Collection.open(r.uri)
{key: type(coll.metadata.get(key)) for key in r.names}
coll.close()
```
## Array-Level Metadata: Python Write
```{r py-array-uri-init}
# Temporary path shared with the Python chunks through `r.uri`
uri <- tempfile(pattern = "py-array-meta")
```
```{python py-init-array}
# Create a 200 x 100 int32 sparse array at the URI chosen on the R side and
# store one `str` and one `bytes` metadata value. The context manager closes
# the handle on exit, including when one of the metadata writes raises.
with tiledbsoma.SparseNDArray.create(
    r.uri,
    type=pyarrow.int32(),
    shape=(200, 100),
) as arr:
    arr.metadata["str"] = "test_string"
    arr.metadata["bytes"] = b"test_bytes"
```
```{python py-read-array-py}
# Reopen the array and map each metadata key to the Python type of the value
# stored under it
sparse = tiledbsoma.open(r.uri)
{key: type(sparse.metadata[key]) for key in r.names}
sparse.close()
```
```{r r-read-array-py}
# Open the array at `uri` from R and report the class of each metadata value
sparse_arr <- SOMASparseNDArrayOpen(uri)
meta <- sparse_arr$get_metadata()
sapply(names, function(key) class(meta[[key]]))
sparse_arr$close()
```
## Array-Level Metadata: R Write
```{r r-init-array, error=TRUE}
# Create a 200 x 100 int32 sparse array from R and try to store a character
# and a raw metadata value; the chunk is knitted with `error=TRUE` so the
# failure is shown
uri <- tempfile(pattern = "r-array-meta")
arr <- SOMASparseNDArrayCreate(uri, type = arrow::int32(), shape = c(200L, 100L))
str_meta <- list(str = "test_character")
raw_meta <- list(bytes = charToRaw("test_raw"))
arr$set_metadata(str_meta)
arr$set_metadata(raw_meta) # R API errors out
arr$close()
```
```{r r-read-array-r}
# Reopen the array at `uri` and report the class of each metadata value
sparse_arr <- SOMASparseNDArrayOpen(uri)
meta <- sparse_arr$get_metadata()
sapply(names, function(key) class(meta[[key]]))
sparse_arr$close()
```
```{python py-read-array-r}
# Reopen the array from Python; `.get()` is used instead of indexing,
# presumably so a key that was never stored shows up as NoneType
sparse = tiledbsoma.open(r.uri)
{key: type(sparse.metadata.get(key)) for key in r.names}
sparse.close()
```
## Dense ND Array: Python `str`
```{r py-dense-str-uri-init}
# Temporary path shared with the Python chunk through `r.uri`
uri <- tempfile(pattern = "py-dense-str")
```
```{python py-init-dense-str, error=TRUE}
# Build a 200 x 100 NumPy array of empty `str` values, let pyarrow infer the
# Arrow type from its first row, and try to create a dense array of that type
shape = (200, 100)
n_cells = numpy.prod(shape)
pyarr = numpy.array([""] * n_cells, dtype=numpy.str_).reshape(shape)
atype = pyarrow.infer_type(pyarr[0])
atype
# Python API errors out
arr = tiledbsoma.DenseNDArray.create(
    r.uri,
    type=atype,
    shape=pyarr.shape,
)
# arr.write(pyarr)
# arr.close()
```
<!-- ```{python py-read-dense-str-py, eval=FALSE} -->
<!-- arr = tiledbsoma.open(r.uri) -->
<!-- {x: type(arr.metadata[x]) for x in r.names} -->
<!-- arr.close() -->
<!-- ``` -->
<!-- ```{r r-read-dense-str-py, eval=FALSE} -->
<!-- arr <- SOMASparseNDArrayOpen(uri) -->
<!-- md <- arr$get_metadata() -->
<!-- sapply(names, \(x) class(md[[x]])) -->
<!-- arr$close() -->
<!-- ``` -->
## Dense ND Array: Python `bytes`
```{r py-dense-bytes-uri-init}
# Temporary path shared with the Python chunk through `r.uri`
uri <- tempfile(pattern = "py-dense-bytes")
```
```{python py-init-dense-bytes, error=TRUE}
# Build a 200 x 100 NumPy array of empty `bytes` values, let pyarrow infer the
# Arrow type from its first row, and try to create a dense array of that type
shape = (200, 100)
n_cells = numpy.prod(shape)
pyarr = numpy.array([b""] * n_cells, dtype=numpy.bytes_).reshape(shape)
atype = pyarrow.infer_type(pyarr[0])
atype
# Python API errors out
arr = tiledbsoma.DenseNDArray.create(
    r.uri,
    type=atype,
    shape=pyarr.shape,
)
# arr.write(pyarr)
# arr.close()
```
<!-- ```{python py-read-dense-bytes-py, eval=FALSE} -->
<!-- arr = tiledbsoma.open(r.uri) -->
<!-- {x: type(arr.metadata[x]) for x in r.names} -->
<!-- arr.close() -->
<!-- ``` -->
<!-- ```{r r-read-dense-bytes-py, eval=FALSE} -->
<!-- arr <- SOMASparseNDArrayOpen(uri) -->
<!-- md <- arr$get_metadata() -->
<!-- sapply(names, \(x) class(md[[x]])) -->
<!-- arr$close() -->
<!-- ``` -->
## Dense ND Array: R `character`
```{r r-init-dense-character}
# Write a 200 x 100 matrix of empty strings to a dense array whose type is
# whatever arrow infers for the matrix; the inferred type is printed
uri <- tempfile(pattern = "r-dense-character")
strings <- matrix(data = "", nrow = 200L, ncol = 100L)
atype <- arrow::infer_type(strings)
atype
arr <- SOMADenseNDArrayCreate(uri, type = atype, shape = dim(strings))
arr$write(strings)
arr$close()
```
```{r r-read-dense-character-r}
# Read the dense array back as an Arrow table and show the type of its
# `soma_data` column
dense_arr <- SOMADenseNDArrayOpen(uri)
dense_tbl <- dense_arr$read_arrow_table()
dense_tbl$soma_data$type # Different than the type going in
dense_arr$close()
```
```{python py-read-dense-character-r, error=TRUE}
# Try to read the R-written character array from Python; the chunk is knitted
# with `error=TRUE` so the failure is shown
dense = tiledbsoma.open(r.uri)
dense.read()  # pyarrow errors out
dense.close()
```
## Dense ND Array: R `raw`
```{r r-init-dense-raw}
# Write a 200 x 100 matrix of zero bytes to a dense array whose type is
# whatever arrow infers for the matrix; storage mode and type are printed
uri <- tempfile(pattern = "r-dense-raw")
raws <- matrix(data = as.raw(0L), nrow = 200L, ncol = 100L)
typeof(raws)
atype <- arrow::infer_type(raws)
atype
arr <- SOMADenseNDArrayCreate(uri, type = atype, shape = dim(raws))
arr$write(raws)
arr$close()
```
```{r r-read-dense-raw-r}
# Read the dense array back and show both the Arrow type of `soma_data` and
# the R storage mode it converts to
dense_arr <- SOMADenseNDArrayOpen(uri)
dense_tbl <- dense_arr$read_arrow_table()
soma_data <- dense_tbl$soma_data
soma_data$type
typeof(soma_data$as_vector()) # not raw
dense_arr$close()
```
```{python py-read-dense-raw-r}
# Read the R-written raw array from Python and display what comes back
dense = tiledbsoma.open(r.uri)
dense.read()  # int, not bytes
dense.close()
```
## Sparse ND Array: Python `str`
```{r py-sparse-str-uri-init}
# Temporary path for the sparse `str` test; the prefix names the sparse case
# instead of reusing the dense-array prefix
uri <- tempfile(pattern = "py-sparse-str")
```
```{python py-init-sparse-str, error=TRUE}
# Build COO inputs for a 200 x 100 tensor with 60% of its cells holding empty
# `str` values, then try to wrap them in a pyarrow sparse tensor
shape = (200, 100)
n_cells = int(numpy.prod(shape))
size = int(0.6 * n_cells)
# Sample flat cell indices without replacement over the full range
# [0, n_cells), so every cell is reachable and no coordinate is repeated
ij = numpy.random.choice(n_cells, size=size, replace=False)
# One (row, column) pair per stored value, i.e. an (nnz, ndim) index matrix
# NOTE(review): (nnz, ndim) is the layout pyarrow's COO index is documented
# to use - confirm against the installed pyarrow version
coords = numpy.column_stack((ij % shape[0], ij // shape[0]))
tensor = pyarrow.SparseCOOTensor.from_numpy(
    numpy.array([""] * size, dtype=numpy.str_),
    coords,
    shape=shape,
)  # pyarrow errors out
```
## Sparse ND Array: Python `bytes`
```{r py-sparse-bytes-uri-init}
# Temporary path for the sparse `bytes` test; the prefix names the sparse
# `bytes` case instead of reusing the dense `str` prefix
uri <- tempfile(pattern = "py-sparse-bytes")
```
```{python py-init-sparse-bytes, error=TRUE}
# Build COO inputs for a 200 x 100 tensor with 60% of its cells holding empty
# `bytes` values, then try to wrap them in a pyarrow sparse tensor
shape = (200, 100)
n_cells = int(numpy.prod(shape))
size = int(0.6 * n_cells)
# Sample flat cell indices without replacement over the full range
# [0, n_cells), so every cell is reachable and no coordinate is repeated
ij = numpy.random.choice(n_cells, size=size, replace=False)
# One (row, column) pair per stored value, i.e. an (nnz, ndim) index matrix
# NOTE(review): (nnz, ndim) is the layout pyarrow's COO index is documented
# to use - confirm against the installed pyarrow version
coords = numpy.column_stack((ij % shape[0], ij // shape[0]))
tensor = pyarrow.SparseCOOTensor.from_numpy(
    numpy.array([b""] * size, dtype=numpy.bytes_),
    coords,
    shape=shape,
)  # pyarrow errors out
```
## Sparse ND Array: R `character`
```{r r-init-sparse-character}
# Write empty strings to 60% of the cells of a 200 x 100 sparse array
uri <- tempfile(pattern = "r-sparse-character")
shape <- c(200L, 100L)
n_cells <- prod(shape)
# Zero-based flat cell indices, drawn without replacement
flat_idx <- sample.int(n_cells, size = floor(0.6 * n_cells)) - 1L
# Split each flat index into a (row, column) pair using the row count
coo <- data.frame(
  soma_dim_0 = flat_idx %% shape[1L],
  soma_dim_1 = flat_idx %/% shape[1L],
  soma_data = ""
)
atype <- arrow::infer_type(coo[["soma_data"]])
atype
arr <- SOMASparseNDArrayCreate(uri, type = atype, shape = shape)
arr$.write_coordinates(coo)
arr$close()
```
```{r r-read-sparse-character-r}
# Read the sparse array back as one Arrow table and show the type of its
# `soma_data` column
sparse_arr <- SOMASparseNDArrayOpen(uri)
sparse_tbl <- sparse_arr$read()$tables()$concat()
sparse_tbl$soma_data$type # different than the type going in
sparse_arr$close()
```
```{python py-read-sparse-character-r}
# Read the R-written character array from Python as one concatenated table
sparse = tiledbsoma.open(r.uri)
sparse.read().tables().concat()  # sparse arrays work
sparse.close()
```
## Sparse ND Array: R `raw`
```{r r-init-sparse-raw}
# Write zero bytes to 60% of the cells of a 200 x 100 sparse array
uri <- tempfile(pattern = "r-sparse-raw")
shape <- c(200L, 100L)
n_cells <- prod(shape)
# Zero-based flat cell indices, drawn without replacement
flat_idx <- sample.int(n_cells, size = floor(0.6 * n_cells)) - 1L
# Split each flat index into a (row, column) pair using the row count
coo <- data.frame(
  soma_dim_0 = flat_idx %% shape[1L],
  soma_dim_1 = flat_idx %/% shape[1L],
  soma_data = as.raw(0L)
)
# Show the column classes and the storage mode of the data column
vapply(coo, class, FUN.VALUE = character(1L))
typeof(coo[["soma_data"]])
atype <- arrow::infer_type(coo[["soma_data"]])
atype
arr <- SOMASparseNDArrayCreate(uri, type = atype, shape = shape)
arr$.write_coordinates(coo)
arr$close()
```
```{r r-read-sparse-raw-r}
# Read the sparse array back and show both the Arrow type of `soma_data` and
# the R storage mode it converts to
sparse_arr <- SOMASparseNDArrayOpen(uri)
sparse_tbl <- sparse_arr$read()$tables()$concat()
soma_data <- sparse_tbl$soma_data
soma_data$type
typeof(soma_data$as_vector()) # not raw
sparse_arr$close()
```
```{python py-read-sparse-raw-r}
# Read the R-written raw array from Python as one concatenated table
sparse = tiledbsoma.open(r.uri)
sparse.read().tables().concat()  # int, not bytes
sparse.close()
```
## Data Frame: Python Write
```{r py-df-uri-init}
# Temporary path plus a 100-row data frame with an integer64 join ID, an
# empty-string column and a zero-byte raw column; both are reused by the
# Python and R write chunks
uri <- tempfile(pattern = "py-df")
n_obs <- 100L
df <- data.frame(
  soma_joinid = bit64::seq.integer64(0L, n_obs - 1L),
  str = character(n_obs),
  bytes = raw(n_obs)
)
```
```{python py-init-df}
# Start from the R data frame, then overwrite the join IDs with an int64
# range and the `bytes` column with empty Python bytes objects
df = r.df
n_rows = df.shape[0]
df["soma_joinid"] = numpy.arange(n_rows, dtype=numpy.int64)
df["bytes"] = [b"" for _ in range(n_rows)]
# Convert to Arrow and display the resulting table and schema
tbl = pyarrow.Table.from_pandas(df)
tbl
# Create the SOMA data frame from the Arrow schema and write the table
sdf = tiledbsoma.DataFrame.create(
    r.uri,
    schema=tbl.schema,
    domain=[(0, n_rows)],
)
_ = sdf.write(tbl)
sdf.close()
```
```{python py-read-df-py}
# Reopen the data frame and display its contents as one concatenated table
frame = tiledbsoma.open(r.uri)
frame.read().concat()
frame.close()
```
```{r r-read-df-py}
# Read the data frame from R, print the Arrow table, then print its
# conversion to an R data frame
soma_df <- SOMADataFrameOpen(uri)
tbl <- soma_df$read()$concat()
tbl
tbl$to_data_frame()
soma_df$close()
```
## Data Frame: R Write
```{r r-init-df}
# Convert the shared `df` to an Arrow table and write it to a new SOMA data
# frame created from that table's schema
uri <- tempfile(pattern = "r-df")
tbl <- arrow::as_arrow_table(df)
joinid_domain <- list(soma_joinid = c(0L, 100L))
sdf <- SOMADataFrameCreate(uri, schema = tbl$schema, domain = joinid_domain)
sdf$write(tbl)
sdf$close()
```
```{r r-read-df-r}
# Reopen the data frame and print its contents as one concatenated table
soma_df <- SOMADataFrameOpen(uri)
soma_df$read()$concat()
soma_df$close()
```
```{python py-read-df-r}
# Reopen the data frame and display its contents as one concatenated table
frame = tiledbsoma.open(r.uri)
frame.read().concat()
frame.close()
```
## Session Info
```{r sessioninfo}
# Record the package versions used for this run
tiledbsoma::show_package_versions()
utils::packageVersion(pkg = "reticulate")
```
```{python py-package-versions}
# Print tiledbsoma's package-version report for the Python side of this run
tiledbsoma.show_package_versions()
```
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment