-
-
Save mojaveazure/ce7b10447b85de1f360218b173f4516f to your computer and use it in GitHub Desktop.
Test TileDB-SOMA with Python `str`, `bytes`, R `character`, and `raw` data types
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
---
output:
  slidy_presentation: default
  pdf_document: default
---
```{r setup, include=FALSE}
# Document-wide knitr chunk defaults: echo and tidy the code,
# drop messages and warnings from the rendered output
knitr::opts_chunk$set(echo = TRUE, tidy = TRUE, message = FALSE, warning = FALSE)
```
```{r r-install, eval=FALSE}
# Install latest development version of tiledbsoma-r from R-universe;
# the R-universe repository is listed ahead of the configured repositories
soma_repos <- c("https://mojaveazure.r-universe.dev", getOption("repos"))
install.packages("tiledbsoma", repos = soma_repos)
```
```{r r-packages}
# Load R packages, set up Python environment
library(tiledbsoma)
library(reticulate)
reticulate::use_virtualenv("tiledbsoma", required = TRUE)
# Stop early when the Python tiledbsoma module is not available
has_soma_py <- reticulate::py_module_available("tiledbsoma")
stopifnot("Cannot find tiledbsoma-py" = has_soma_py)
# Metadata keys that the R and Python chunks write and read back
names <- c("str", "bytes")
```
```{python py-modules} | |
# Import Python modules | |
import numpy | |
import pyarrow | |
import tiledbsoma | |
``` | |
## Group-Level Metadata: Python Write
```{r py-group-uri-init}
# Temporary path for the collection created from Python
uri <- tempfile("py-group-meta")
```
```{python py-init-group}
# Create the collection from Python and attach one str and one bytes value
collection = tiledbsoma.Collection.create(r.uri)
collection.metadata["str"] = "test_string"
collection.metadata["bytes"] = b"test_bytes"
collection.close()
```
```{python py-read-group-py}
# Python type of each metadata value when read back from Python
collection = tiledbsoma.Collection.open(r.uri)
dict((key, type(collection.metadata[key])) for key in r.names)
collection.close()
```
```{r r-read-group-py}
# R class reported for each metadata entry when read back from R
# NOTE(review): the array-level chunks in this document call class() on the
# metadata value itself; confirm the extra `$name` is intended here
collection <- SOMACollectionOpen(uri)
metadata <- collection$get_metadata()
sapply(names, function(key) class(metadata[[key]]$name))
collection$close()
```
## Group-Level Metadata: R Write
```{r r-init-group, error=TRUE}
# Create the collection from R; the two values are set in separate calls so
# a failure on the raw value does not prevent the character value being set
uri <- tempfile("r-group-meta")
collection <- SOMACollectionCreate(uri)
collection$set_metadata(list(str = "test_character"))
collection$set_metadata(list(bytes = charToRaw("test_raw"))) # R API errors out
collection$close()
```
```{r r-read-group-r}
# R class reported for each metadata entry when read back from R
# NOTE(review): the array-level chunks in this document call class() on the
# metadata value itself; confirm the extra `$name` is intended here
collection <- SOMACollectionOpen(uri)
metadata <- collection$get_metadata()
sapply(names, function(key) class(metadata[[key]]$name))
collection$close()
```
```{python py-read-group-r}
# Python type of each metadata value; .get() is used so a missing key does not raise
collection = tiledbsoma.Collection.open(r.uri)
dict((key, type(collection.metadata.get(key))) for key in r.names)
collection.close()
```
## Array-Level Metadata: Python Write
```{r py-array-uri-init}
# Temporary path for the sparse array created from Python
uri <- tempfile("py-array-meta")
```
```{python py-init-array}
# Create a 200 x 100 int32 sparse array and attach one str and one bytes value
soma_array = tiledbsoma.SparseNDArray.create(r.uri, type=pyarrow.int32(), shape=(200, 100))
soma_array.metadata["str"] = "test_string"
soma_array.metadata["bytes"] = b"test_bytes"
soma_array.close()
```
```{python py-read-array-py}
# Python type of each metadata value when read back from Python
soma_array = tiledbsoma.open(r.uri)
dict((key, type(soma_array.metadata[key])) for key in r.names)
soma_array.close()
```
```{r r-read-array-py}
# R class of each metadata value when read back from R
soma_array <- SOMASparseNDArrayOpen(uri)
metadata <- soma_array$get_metadata()
sapply(names, function(key) class(metadata[[key]]))
soma_array$close()
```
## Array-Level Metadata: R Write
```{r r-init-array, error=TRUE}
# Create a 200 x 100 int32 sparse array from R; the two values are set in
# separate calls so a failure on the raw value does not affect the character one
uri <- tempfile("r-array-meta")
soma_array <- SOMASparseNDArrayCreate(uri, type = arrow::int32(), shape = c(200L, 100L))
soma_array$set_metadata(list(str = "test_character"))
soma_array$set_metadata(list(bytes = charToRaw("test_raw"))) # R API errors out
soma_array$close()
```
```{r r-read-array-r}
# R class of each metadata value when read back from R
soma_array <- SOMASparseNDArrayOpen(uri)
metadata <- soma_array$get_metadata()
sapply(names, function(key) class(metadata[[key]]))
soma_array$close()
```
```{python py-read-array-r}
# Python type of each metadata value; .get() is used so a missing key does not raise
soma_array = tiledbsoma.open(r.uri)
dict((key, type(soma_array.metadata.get(key))) for key in r.names)
soma_array.close()
```
## Dense ND Array: Python `str`
```{r py-dense-str-uri-init}
# Temporary path for the dense str array attempted from Python
uri <- tempfile("py-dense-str")
```
```{python py-init-dense-str, error=TRUE}
# Build a 200 x 100 array of empty strings and try to create a dense array
# with the Arrow type inferred from its first row
shape = (200, 100)
n_cells = numpy.prod(shape)
empty_strings = numpy.array([""] * n_cells, dtype=numpy.str_).reshape(shape)
arrow_type = pyarrow.infer_type(empty_strings[0])
arrow_type
dense = tiledbsoma.DenseNDArray.create(r.uri, type=arrow_type, shape=empty_strings.shape) # Python API errors out
# dense.write(empty_strings)
# dense.close()
```
<!-- ```{python py-read-dense-str-py, eval=FALSE} -->
<!-- arr = tiledbsoma.open(r.uri) -->
<!-- {x: type(arr.metadata[x]) for x in r.names} -->
<!-- arr.close() -->
<!-- ``` -->
<!-- ```{r r-read-dense-str-py, eval=FALSE} -->
<!-- arr <- SOMASparseNDArrayOpen(uri) -->
<!-- md <- arr$get_metadata() -->
<!-- sapply(names, \(x) class(md[[x]])) -->
<!-- arr$close() -->
<!-- ``` -->
## Dense ND Array: Python `bytes`
```{r py-dense-bytes-uri-init}
# Temporary path for the dense bytes array attempted from Python
uri <- tempfile("py-dense-bytes")
```
```{python py-init-dense-bytes, error=TRUE}
# Build a 200 x 100 array of empty byte strings and try to create a dense
# array with the Arrow type inferred from its first row
shape = (200, 100)
n_cells = numpy.prod(shape)
empty_bytes = numpy.array([b""] * n_cells, dtype=numpy.bytes_).reshape(shape)
arrow_type = pyarrow.infer_type(empty_bytes[0])
arrow_type
dense = tiledbsoma.DenseNDArray.create(r.uri, type=arrow_type, shape=empty_bytes.shape) # Python API errors out
# dense.write(empty_bytes)
# dense.close()
```
<!-- ```{python py-read-dense-bytes-py, eval=FALSE} -->
<!-- arr = tiledbsoma.open(r.uri) -->
<!-- {x: type(arr.metadata[x]) for x in r.names} -->
<!-- arr.close() -->
<!-- ``` -->
<!-- ```{r r-read-dense-bytes-py, eval=FALSE} -->
<!-- arr <- SOMASparseNDArrayOpen(uri) -->
<!-- md <- arr$get_metadata() -->
<!-- sapply(names, \(x) class(md[[x]])) -->
<!-- arr$close() -->
<!-- ``` -->
## Dense ND Array: R `character`
```{r r-init-dense-character}
# Write a 200 x 100 matrix of empty strings as a dense array, using the
# Arrow type inferred from the matrix (printed for reference)
uri <- tempfile("r-dense-character")
strings <- array("", dim = c(200L, 100L))
arrow_type <- arrow::infer_type(strings)
arrow_type
dense <- SOMADenseNDArrayCreate(uri, type = arrow_type, shape = dim(strings))
dense$write(strings)
dense$close()
```
```{r r-read-dense-character-r}
# Arrow type of the data column as read back from R
dense <- SOMADenseNDArrayOpen(uri)
dense_tbl <- dense$read_arrow_table()
dense_tbl$soma_data$type # Different than the type going in
dense$close()
```
```{python py-read-dense-character-r, error=TRUE}
# Read the same array from Python
dense = tiledbsoma.open(r.uri)
dense.read() # pyarrow errors out
dense.close()
```
## Dense ND Array: R `raw`
```{r r-init-dense-raw}
# Write a 200 x 100 matrix of zero bytes as a dense array, using the Arrow
# type inferred from the matrix (storage mode and type are printed)
uri <- tempfile("r-dense-raw")
raws <- array(raw(length = 1L), dim = c(200L, 100L))
typeof(raws)
arrow_type <- arrow::infer_type(raws)
arrow_type
dense <- SOMADenseNDArrayCreate(uri, type = arrow_type, shape = dim(raws))
dense$write(raws)
dense$close()
```
```{r r-read-dense-raw-r}
# Arrow type and R storage mode of the data column as read back from R
dense <- SOMADenseNDArrayOpen(uri)
dense_tbl <- dense$read_arrow_table()
dense_tbl$soma_data$type
typeof(dense_tbl$soma_data$as_vector()) # not raw
dense$close()
```
```{python py-read-dense-raw-r}
# Read the same array from Python
dense = tiledbsoma.open(r.uri)
dense.read() # int, not bytes
dense.close()
```
## Sparse ND Array: Python `str`
```{r py-sparse-str-uri-init}
# Temporary path for the sparse str array attempted from Python
# (prefix names this section rather than the dense str one)
uri <- tempfile("py-sparse-str")
```
```{python py-init-sparse-str, error=TRUE}
# Try to build a COO tensor of empty strings covering 60% of a 200 x 100 grid
shape = (200, 100)
n_cells = int(numpy.prod(shape))
size = int(0.6 * n_cells)
# Distinct linear cell indices over the whole grid [0, n_cells), matching the
# R chunks' sample.int(); randint's exclusive upper bound of n_cells - 1 never
# picked the last cell and could repeat coordinates
ij = numpy.random.choice(n_cells, size=size, replace=False)
# One (row, column) pair per non-zero value, i.e. shape (size, 2)
# NOTE(review): assumes pyarrow expects COO indices as (nnz, ndim) -- confirm
coords = numpy.column_stack((ij % shape[0], ij // shape[0]))
tensor = pyarrow.SparseCOOTensor.from_numpy(
  numpy.array([""] * size, dtype=numpy.str_),
  coords,
  shape=shape
) # pyarrow errors out
```
## Sparse ND Array: Python `bytes`
```{r py-sparse-bytes-uri-init}
# Temporary path for the sparse bytes array attempted from Python
# (prefix names this section rather than the dense str one)
uri <- tempfile("py-sparse-bytes")
```
```{python py-init-sparse-bytes, error=TRUE}
# Try to build a COO tensor of empty byte strings covering 60% of a 200 x 100 grid
shape = (200, 100)
n_cells = int(numpy.prod(shape))
size = int(0.6 * n_cells)
# Distinct linear cell indices over the whole grid [0, n_cells), matching the
# R chunks' sample.int(); randint's exclusive upper bound of n_cells - 1 never
# picked the last cell and could repeat coordinates
ij = numpy.random.choice(n_cells, size=size, replace=False)
# One (row, column) pair per non-zero value, i.e. shape (size, 2)
# NOTE(review): assumes pyarrow expects COO indices as (nnz, ndim) -- confirm
coords = numpy.column_stack((ij % shape[0], ij // shape[0]))
tensor = pyarrow.SparseCOOTensor.from_numpy(
  numpy.array([b""] * size, dtype=numpy.bytes_),
  coords,
  shape=shape
) # pyarrow errors out
```
## Sparse ND Array: R `character`
```{r r-init-sparse-character}
# Write empty strings at a random 60% of the cells of a 200 x 100 sparse array
uri <- tempfile("r-sparse-character")
shape <- c(200L, 100L)
n_cells <- prod(shape)
# Zero-based linear cell indices, sampled without replacement
cell_index <- sample.int(n_cells, size = floor(0.6 * n_cells)) - 1L
coo <- data.frame(
  soma_dim_0 = cell_index %% shape[1L],
  soma_dim_1 = cell_index %/% shape[1L],
  soma_data = ""
)
arrow_type <- arrow::infer_type(coo$soma_data)
arrow_type
sparse <- SOMASparseNDArrayCreate(uri, type = arrow_type, shape = shape)
sparse$.write_coordinates(coo)
sparse$close()
```
```{r r-read-sparse-character-r}
# Arrow type of the data column as read back from R
sparse <- SOMASparseNDArrayOpen(uri)
tbl <- sparse$read()$tables()$concat()
tbl$soma_data$type # different than the type going in
sparse$close()
```
```{python py-read-sparse-character-r}
# Read the same array from Python
sparse = tiledbsoma.open(r.uri)
sparse.read().tables().concat() # sparse arrays work
sparse.close()
```
## Sparse ND Array: R `raw`
```{r r-init-sparse-raw}
# Write zero bytes at a random 60% of the cells of a 200 x 100 sparse array
uri <- tempfile("r-sparse-raw")
shape <- c(200L, 100L)
n_cells <- prod(shape)
# Zero-based linear cell indices, sampled without replacement
cell_index <- sample.int(n_cells, size = floor(0.6 * n_cells)) - 1L
coo <- data.frame(
  soma_dim_0 = cell_index %% shape[1L],
  soma_dim_1 = cell_index %/% shape[1L],
  soma_data = as.raw(0L)
)
# Column classes, storage mode and inferred Arrow type of the data column
vapply(coo, class, character(1L))
typeof(coo$soma_data)
arrow_type <- arrow::infer_type(coo$soma_data)
arrow_type
sparse <- SOMASparseNDArrayCreate(uri, type = arrow_type, shape = shape)
sparse$.write_coordinates(coo)
sparse$close()
```
```{r r-read-sparse-raw-r}
# Arrow type and R storage mode of the data column as read back from R
sparse <- SOMASparseNDArrayOpen(uri)
tbl <- sparse$read()$tables()$concat()
tbl$soma_data$type
typeof(tbl$soma_data$as_vector()) # not raw
sparse$close()
```
```{python py-read-sparse-raw-r}
# Read the same array from Python
sparse = tiledbsoma.open(r.uri)
sparse.read().tables().concat() # int, not bytes
sparse.close()
```
## Data Frame: Python Write
```{r py-df-uri-init}
# Temporary path plus a 100-row data frame with an integer64 join id,
# a character column and a raw column; `df` is reused by later chunks
uri <- tempfile("py-df")
n_rows <- 100L
df <- data.frame(
  soma_joinid = bit64::seq.integer64(0L, n_rows - 1L),
  str = character(n_rows),
  bytes = raw(n_rows)
)
```
```{python py-init-df}
# Take the R data frame, rebuild the join id as int64 and the bytes column
# as Python bytes, then write it as a SOMA data frame
df = r.df
n_rows = df.shape[0]
df["soma_joinid"] = numpy.arange(n_rows, dtype=numpy.int64)
df["bytes"] = [b""] * n_rows
tbl = pyarrow.Table.from_pandas(df)
tbl
sdf = tiledbsoma.DataFrame.create(r.uri, schema=tbl.schema, domain=[(0, n_rows)])
_ = sdf.write(tbl)
sdf.close()
```
```{python py-read-df-py}
# Read the data frame back from Python
sdf = tiledbsoma.open(r.uri)
sdf.read().concat()
sdf.close()
```
```{r r-read-df-py}
# Read the data frame back from R, as an Arrow table and as an R data frame
sdf <- SOMADataFrameOpen(uri)
tbl <- sdf$read()$concat()
tbl
tbl$to_data_frame()
sdf$close()
```
## Data Frame: R Write
```{r r-init-df}
# Write the R data frame `df` (built in an earlier chunk) as a SOMA data frame
uri <- tempfile("r-df")
tbl <- arrow::as_arrow_table(df)
joinid_domain <- list(soma_joinid = c(0L, 100L))
sdf <- SOMADataFrameCreate(uri, schema = tbl$schema, domain = joinid_domain)
sdf$write(tbl)
sdf$close()
```
```{r r-read-df-r}
# Read the data frame back from R
sdf <- SOMADataFrameOpen(uri)
sdf$read()$concat()
sdf$close()
```
```{python py-read-df-r}
# Read the same data frame from Python
sdf = tiledbsoma.open(r.uri)
sdf.read().concat()
sdf.close()
```
## Session Info
```{r sessioninfo}
# Versions reported by tiledbsoma-r, plus the reticulate version
tiledbsoma::show_package_versions()
utils::packageVersion("reticulate")
```
```{python py-package-versions}
# Versions reported by tiledbsoma-py
tiledbsoma.show_package_versions()
```
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment