Source code for megafish.load

import os

import dask.config
import numpy as np
import pandas as pd
from tqdm import tqdm
import zarr
import xarray as xr
import dask
from dask.diagnostics import ProgressBar
import dask.array as da
from imaris_ims_file_reader.ims import ims
import tifffile

from .utils import natural_sort, get_tile_yx
from .config import show_resource



[docs]
def make_dirlist(dirlist_path, image_dir):
    """
    Generates a CSV file listing all cycle directories within a specified image directory.
    Each cycle directory must contain tiled images organized by color, z, y, and x.


    Args:
        dirlist_path (str): The file path to save the generated directory list CSV.
        image_dir (str): The path to the main directory containing subfolders for each cycle.

    Returns:
        None: The function creates a CSV file at dirlist_path.
    """
    dirs = os.listdir(image_dir)
    dirs = [os.path.join(image_dir, dir_) for dir_ in dirs]
    dirs = natural_sort(dirs)
    dirs = [dir_ for dir_ in dirs if os.path.isdir(dir_)]

    df = pd.DataFrame({"folder": dirs})

    if not os.path.exists(os.path.dirname(dirlist_path)):
        os.makedirs(os.path.dirname(dirlist_path))
    df.to_csv(dirlist_path, index=False)




[docs]
def make_imagepath_cYX_from_dirlist(
        zarr_path, groups, channels, n_cycle, n_tile_y, n_tile_x, scan_type,
        dirlist_path, subfooter="", footer="_imagepath", ext=".tif"):
    """
    Generates a CSV file mapping image paths to cycle, tile, and channel information for spatial-omics images.

    Args:
        zarr_path (str): Path to the base .zarr file.
        groups (list of str): List of group names, each corresponding to a specific analysis group.
        channels (list of str): List of channels corresponding to the groups.
        n_cycle (int): Total number of cycles to process.
        n_tile_y (int): Number of tiles along the y-axis.
        n_tile_x (int): Number of tiles along the x-axis.
        scan_type (str): Type of scan, determining the tile layout.
        dirlist_path (str): Path to the CSV file with the list of cycle directories.
        subfooter (str, optional): String to append before the footer in the output CSV filename; defaults to an empty string.
        footer (str, optional): String appended to the output CSV filename; defaults to "_imagepath".
        ext (str, optional): File extension of the image files; defaults to ".tif".

    Returns:
        None: The function creates a CSV file with the generated image paths and associated metadata at the modified `zarr_path`.
    """
    # Define output path for image paths CSV
    imagepath_path = zarr_path.replace(
        ".zarr", subfooter + footer + ".csv")

    # Generate a list of tile coordinates (y, x) based on scan type
    tile_yxs = get_tile_yx(n_tile_y, n_tile_x, scan_type)

    # Initialize lists to store data for each CSV column
    group_rows = []
    cycle_rows = []
    tile_y_rows = []
    tile_x_rows = []
    path_rows = []
    channel_rows = []

    # Read the list of cycle directories from dirlist_path CSV
    df_dirlist = pd.read_csv(dirlist_path)
    dirs = df_dirlist["folder"].values

    # Iterate over each cycle directory
    for cycle, dir_ in enumerate(dirs):
        # List all files in the cycle directory
        files = os.listdir(dir_)
        for group_name, channel in zip(groups, channels):
            # Filter and sort files with specified extension
            files = [file for file in files if file.endswith(ext)]
            files = natural_sort(files)
            # Map each tile coordinate to a file path
            for tile_yx, file in zip(tile_yxs, files):
                tile_y, tile_x = tile_yx
                path = os.path.join(dir_, file)
                # Append data for each row
                group_rows.append(group_name)
                cycle_rows.append(cycle + 1)
                tile_y_rows.append(tile_y + 1)
                tile_x_rows.append(tile_x + 1)
                path_rows.append(path)
                channel_rows.append(channel)

    # Create a DataFrame and save the output CSV with image paths
    df = pd.DataFrame({
        "group": group_rows, "cycle": cycle_rows,
        "tile_y": tile_y_rows, "tile_x": tile_x_rows,
        "path": path_rows, "channel": channel_rows})
    df.to_csv(imagepath_path, index=False)




[docs]
def make_imagepath_cYX(
        zarr_path, groups, channels, n_cycle, n_tile_y, n_tile_x, scan_type,
        image_dir, subfooter="", footer="_imagepath", ext=".ims"):
    """
    Generates a CSV file mapping image paths to cycle, tile, and channel information for spatial-omics data.

    Args:
        zarr_path (str): Path to the base .zarr file.
        groups (list of str): List of group names, each corresponding to a specific analysis group.
        channels (list of str): List of channels corresponding to the groups.
        n_cycle (int): Total number of cycles to process.
        n_tile_y (int): Number of tiles along the y-axis.
        n_tile_x (int): Number of tiles along the x-axis.
        scan_type (str): Type of scan, determining the tile layout.
        image_dir (str): Path to the main directory containing subfolders for each cycle, each with images organized by color, z, y, and x.
        subfooter (str, optional): String to append before the footer in the output CSV filename; defaults to an empty string.
        footer (str, optional): String appended to the output CSV filename; defaults to "_imagepath".

    Returns:
        None: The function creates a CSV file with image paths and associated metadata at the modified `zarr_path`.
    """
    # Define output path for image paths CSV
    imagepath_path = zarr_path.replace(
        ".zarr", subfooter + footer + ".csv")

    # Generate tile coordinates (y, x) based on scan type
    tile_yxs = get_tile_yx(n_tile_y, n_tile_x, scan_type)

    # Initialize lists to store data for each CSV column
    group_rows = []
    cycle_rows = []
    tile_y_rows = []
    tile_x_rows = []
    path_rows = []
    channel_rows = []

    # List and sort subdirectories in image_dir, assumed to be cycles
    sub_dirs = os.listdir(image_dir)
    sub_dirs = natural_sort(sub_dirs)

    # Iterate over each cycle directory
    for cycle, sub_dir in enumerate(sub_dirs):

        sub_img_dir = os.path.join(image_dir, sub_dir)
        files = os.listdir(sub_img_dir)
        # Filter and sort files with specified extension
        for group_name, channel in zip(groups, channels):
            files = [file for file in files if file.endswith(ext)]
            files = natural_sort(files)

            # Map each tile coordinate to a file path
            for tile_yx, file in zip(tile_yxs, files):
                tile_y, tile_x = tile_yx
                path = os.path.join(sub_img_dir, file)

                # Append data for each row
                group_rows.append(group_name)
                cycle_rows.append(cycle + 1)
                tile_y_rows.append(tile_y + 1)
                tile_x_rows.append(tile_x + 1)
                path_rows.append(path)
                channel_rows.append(channel)

    # Create a DataFrame and save the output CSV with image paths
    df = pd.DataFrame({
        "group": group_rows, "cycle": cycle_rows,
        "tile_y": tile_y_rows, "tile_x": tile_x_rows,
        "path": path_rows, "channel": channel_rows})

    df.to_csv(imagepath_path, index=False)




[docs]
def ims_cYXzyx(zarr_path, n_z, n_y, n_x, imagepath_footer="_imagepath"):
    """
    Creates empty Zarr arrays for image data in cycle, tile, and spatial (z, y, x) dimensions,
    then loads .ims images into these arrays using metadata from an image path CSV.

    Args:
        zarr_path (str): Path to the base .zarr file to store image data.
        n_z (int): Number of z-slices per tile.
        n_y (int): Image height (pixels) for each tile.
        n_x (int): Image width (pixels) for each tile.
        imagepath_footer (str, optional): String to append to the CSV filename; defaults to "_imagepath".

    Returns:
        None: The function creates Zarr arrays with image data and writes to `zarr_path`.
    """
    # Define the CSV path based on zarr_path
    imagepath_path = zarr_path.replace(".zarr", imagepath_footer + ".csv")
    # Load image paths and metadata from CSV
    df_imagepath = pd.read_csv(imagepath_path)

    # Determine the number of cycles, tile_y, and tile_x from the CSV data
    n_cycle = df_imagepath["cycle"].max()
    n_tile_y = df_imagepath["tile_y"].max()
    n_tile_x = df_imagepath["tile_x"].max()

    # Unique group names to create datasets for each group
    groups = df_imagepath["group"].unique()

    # Set array dimensions and coordinates for DataArray
    dims = ("cycle", "tile_y", "tile_x", "z", "y", "x")
    coords = {
        "cycle": np.arange(n_cycle),
        "tile_y": np.arange(n_tile_y), "tile_x": np.arange(n_tile_x),
        "z": np.arange(n_z), "y": np.arange(n_y), "x": np.arange(n_x), }
    # Define chunk sizes for optimal storage
    chunks = (1, 1, 1, n_z, n_y, n_x)

    # Initialize and save empty Zarr arrays for each group
    empty_data = da.zeros(
        (n_cycle, n_tile_y, n_tile_x, n_z, n_y, n_x),
        chunks=chunks, dtype=np.uint16)

    print("Saving empty images: ")
    with ProgressBar():
        for group in groups:
            xar = xr.DataArray(empty_data, dims=dims, coords=coords)
            ds = xar.to_dataset(name="data")
            ds.to_zarr(zarr_path, group=group + "/0", mode="w")

    # Define function to load .ims images into zarr array blocks
    def _load_ims_zyx(zar, df_group, block_info=None):
        # Get cycle and tile coordinates for the current block
        cycle = block_info[0]["chunk-location"][0]
        tile_y = block_info[0]["chunk-location"][1]
        tile_x = block_info[0]["chunk-location"][2]

        # Filter DataFrame to obtain metadata for the current block
        df_group = df_group[
            (df_group["cycle"] == cycle + 1) &
            (df_group["tile_y"] == tile_y + 1) &
            (df_group["tile_x"] == tile_x + 1)]

        # If no matching image is found, return a zero array
        if len(df_group) == 0:
            return np.zeros(zar.shape, dtype=np.uint16)

        # Load .ims file for the specified cycle, tile, and channel
        channel = df_group["channel"].values[0] - 1
        path = df_group["path"].values[0]
        img_ims = ims(path)

        # Check .ims image shape and adjust as needed
        if len(img_ims.shape) != 5:  # Expected shape: (zoom, channel, z, y, x)
            print("Unexpected shape " + str(img_ims.shape) + ": " + path)
            return np.zeros(zar.shape, dtype=np.uint16)
        if img_ims.shape[1] < channel + 1:
            print("No channel found: " + path)
            return np.zeros(zar.shape, dtype=np.uint16)
        img = img_ims[0][channel]
        if len(img.shape) == 2:
            img = np.expand_dims(img, axis=0)

        # Slice to fit specified dimensions and initialize output with zeros
        img = img[:n_z, :n_y, :n_x]
        output = np.zeros(zar.shape, dtype=np.uint16)
        output[:, :, :, :img.shape[0], :img.shape[1], :img.shape[2]] = img
        return output

    # Load and map images to the Zarr arrays for each group
    for group in groups:
        dar = da.from_zarr(zarr_path, component=group + "/0/data")
        print(f"Loading cYXzyx ims images: {group}" + show_resource())
        group_df = df_imagepath[df_imagepath["group"] == group]

        # Apply _load_ims_zyx function to each block
        res = da.map_blocks(_load_ims_zyx, dar, group_df, dtype=np.uint16)

        # Convert results to DataArray and save to Zarr with appropriate chunking
        with ProgressBar():
            out = xr.DataArray(res, dims=dims, coords=coords)
            out = out.to_dataset(name="data")
            chunks = {"cycle": 1, "tile_y": 1, "tile_x": 1,
                      "z": n_z, "y": n_y, "x": n_x}
            out = out.chunk(chunks=chunks)
            out.to_zarr(zarr_path, mode="w", group=group + "/0")




[docs]
def tif_cYXzyx(zarr_path, n_z, n_y, n_x, imagepath_footer="_imagepath",
               ext=".tif", dtype=None, tif_dims="czyx"):
    """
    Creates empty Zarr arrays for image data in cycle, tile, and spatial (z, y, x) dimensions,
    then loads TIFF images into these arrays using metadata from an image path CSV.

    Args:
        zarr_path (str): Path to the base .zarr file to store image data.
        n_z (int): Number of z-slices per tile.
        n_y (int): Image height (pixels) for each tile.
        n_x (int): Image width (pixels) for each tile.
        imagepath_footer (str, optional): String to append to the CSV filename; defaults to "_imagepath".
        ext (str, optional): File extension of the image files; defaults to ".tif".
        dtype (str, optional): Data type to cast the image to; defaults to None.
        dims (str, optional): Order of dimensions in the image data; defaults to "czyx".

    Returns:
        None: The function creates Zarr arrays with image data and writes to `zarr_path`.
    """
    # Define the CSV path based on zarr_path
    imagepath_path = zarr_path.replace(".zarr", imagepath_footer + ".csv")
    # Load image paths and metadata from CSV
    df_imagepath = pd.read_csv(imagepath_path)

    # Determine the number of cycles, tile_y, and tile_x from the CSV data
    n_cycle = df_imagepath["cycle"].max()
    n_tile_y = df_imagepath["tile_y"].max()
    n_tile_x = df_imagepath["tile_x"].max()

    # Unique group names to create datasets for each group
    groups = df_imagepath["group"].unique()

    # Set array dimensions and coordinates for DataArray
    dims = ("cycle", "tile_y", "tile_x", "z", "y", "x")
    coords = {
        "cycle": np.arange(n_cycle),
        "tile_y": np.arange(n_tile_y), "tile_x": np.arange(n_tile_x),
        "z": np.arange(n_z), "y": np.arange(n_y), "x": np.arange(n_x), }
    # Define chunk sizes for optimal storage
    chunks = (1, 1, 1, n_z, n_y, n_x)

    if dtype is None:
        dtype = np.uint16

    # Initialize and save empty Zarr arrays for each group
    empty_data = da.zeros(
        (n_cycle, n_tile_y, n_tile_x, n_z, n_y, n_x),
        chunks=chunks, dtype=dtype)

    print("Saving empty images: ")
    with ProgressBar():
        for group in groups:
            xar = xr.DataArray(empty_data, dims=dims, coords=coords)
            ds = xar.to_dataset(name="data")
            ds.to_zarr(zarr_path, group=group + "/0", mode="w")

    # Define function to load .ims images into zarr array blocks
    def _load_tif_zyx(zar, df_group, _dtype, block_info=None):
        # Get cycle and tile coordinates for the current block
        cycle = block_info[0]["chunk-location"][0]
        tile_y = block_info[0]["chunk-location"][1]
        tile_x = block_info[0]["chunk-location"][2]

        # Filter DataFrame to obtain metadata for the current block
        df_group = df_group[
            (df_group["cycle"] == cycle + 1) &
            (df_group["tile_y"] == tile_y + 1) &
            (df_group["tile_x"] == tile_x + 1)]

        # If no matching image is found, return a zero array
        if len(df_group) == 0:
            return np.zeros(zar.shape, dtype=_dtype)

        # Load tif file for the specified cycle, tile, and channel
        channel = df_group["channel"].values[0] - 1
        path = df_group["path"].values[0]
        img_tif = tifffile.imread(path)
        img_tif = img_tif.astype(_dtype)

        # Check tif image shape and adjust as needed
        # Expected shape: (c, y, x) or (c, z, y, x)
        if len(img_tif.shape) not in [3, 4]:
            print("Unexpected shape " + str(img_tif.shape) + ": " + path)
            return np.zeros(zar.shape, dtype=_dtype)
        if img_tif.shape[0] < channel + 1:
            print("No channel found: " + path)
            return np.zeros(zar.shape, dtype=_dtype)
        if tif_dims == "czyx":
            img = img_tif[channel]
        elif tif_dims == "zyxc":
            img = img_tif[:, :, :, channel]
        elif tif_dims == "cyx":
            img = img_tif[channel]
            img = np.expand_dims(img, axis=0)
        elif tif_dims == "yxc":
            img = img_tif[:, :, channel]
            img = np.expand_dims(img, axis=0)
        else:
            raise ValueError("Unsupported tif_dims")

        # Slice to fit specified dimensions and initialize output with zeros
        img = img[:n_z, :n_y, :n_x]
        output = np.zeros(zar.shape, dtype=_dtype)
        output[:, :, :, :img.shape[0], :img.shape[1], :img.shape[2]] = img
        return output

    # Load and map images to the Zarr arrays for each group
    for group in groups:
        dar = da.from_zarr(zarr_path, component=group + "/0/data")
        print(f"Loading cYXzyx tif images: {group}" + show_resource())
        group_df = df_imagepath[df_imagepath["group"] == group]

        # Apply _load_ims_zyx function to each block
        res = da.map_blocks(_load_tif_zyx, dar, group_df, dtype, dtype=dtype)

        # Convert results to DataArray and save to Zarr with appropriate chunking
        with ProgressBar():
            out = xr.DataArray(res, dims=dims, coords=coords)
            out = out.to_dataset(name="data")
            chunks = {"cycle": 1, "tile_y": 1, "tile_x": 1,
                      "z": n_z, "y": n_y, "x": n_x}
            out = out.chunk(chunks=chunks)
            out.to_zarr(zarr_path, mode="w", group=group + "/0")




[docs]
def stitched_ims(
        zarr_path, group, image_path, channel, n_tile_y, n_tile_x):
    """
    Processes a stitched image by splitting it into tiles and saving them in a Zarr array.

    Args:
        zarr_path (str): Path to the Zarr file where tiled data will be saved.
        group (str): Group name in the Zarr file for storing the tiled image data.
        image_path (str): Path to the stitched image file in .ims format.
        channel (int): Channel index for selecting specific image data.
        n_tile_y (int): Number of tiles along the y-axis.
        n_tile_x (int): Number of tiles along the x-axis.

    Returns:
        None: The function saves tiled images in the specified Zarr group without returning any value.
    """
    # Load stitched image and select specified channel
    print("Loading stitched image: " + image_path)
    stitched_img = ims(image_path)[0, channel]

    # If image has 3D shape, perform max projection across z-axis
    if len(stitched_img.shape) == 3:
        stitched_img = stitched_img.max(axis=0)

    # Define tile dimensions based on image size and tile count
    n_stitched_y, n_stitched_x = stitched_img.shape
    tile_y_size = n_stitched_y // n_tile_y
    tile_x_size = n_stitched_x // n_tile_x

    # Initialize array for storing individual tiles
    tiled_stitched = np.zeros((n_tile_y, n_tile_x, tile_y_size, tile_x_size))

    # Slice the stitched image into tiles and assign to tiled_stitched array
    for y in range(n_tile_y):
        for x in range(n_tile_x):
            tiled_stitched[y, x, :, :] = stitched_img[y * tile_y_size:(
                y + 1) * tile_y_size, x * tile_x_size:(x + 1) * tile_x_size]

    # Convert tiled image to xarray DataArray and configure coordinates and dimensions
    dims = ("tile_y", "tile_x", "y", "x")
    coords = {"tile_y": np.arange(n_tile_y),
              "tile_x": np.arange(n_tile_x),
              "y": np.arange(tile_y_size),
              "x": np.arange(tile_x_size), }

    # Set chunk sizes for storage
    tiled_stitched = xr.DataArray(tiled_stitched, dims=dims, coords=coords)
    tiled_stitched = tiled_stitched.chunk(
        {"tile_y": 1, "tile_x": 1, "y": tile_y_size, "x": tile_x_size})
    tiled_stitched = tiled_stitched.to_dataset(name="data")

    # Save the tiled DataArray to the specified Zarr group
    tiled_stitched.to_zarr(zarr_path, mode="w", group=group + "/0")




[docs]
def stitched_tif(
        zarr_path, group, image_path, n_tile_y, n_tile_x, dtype="uint16"):
    """
    Processes a stitched TIFF image by splitting it into tiles and saving them in a Zarr array.

    Args:
        zarr_path (str): Path to the Zarr file where tiled data will be saved.
        group (str): Group name in the Zarr file for storing the tiled image data.
        image_path (str): Path to the stitched image file in TIFF format.
        n_tile_y (int): Number of tiles along the y-axis.
        n_tile_x (int): Number of tiles along the x-axis.
        dtype (str, optional): Data type to cast the image to; defaults to "uint16".

    Returns:
        None: The function saves tiled images in the specified Zarr group without returning any value.
    """
    # Load stitched TIFF image and cast to specified dtype
    print("Loading stitched tif image: " + image_path)
    stitched_img = tifffile.imread(image_path)
    stitched_img = stitched_img.astype(dtype)

    # If image has 3D shape, perform max projection across z-axis
    if len(stitched_img.shape) == 3:
        stitched_img = stitched_img.max(axis=0)

    # Define tile dimensions based on image size and tile count
    n_stitched_y, n_stitched_x = stitched_img.shape
    tile_y_size = n_stitched_y // n_tile_y
    tile_x_size = n_stitched_x // n_tile_x

    # Initialize array for storing individual tiles
    tiled_stitched = np.zeros((n_tile_y, n_tile_x, tile_y_size, tile_x_size))

    # Slice the stitched image into tiles and assign to tiled_stitched array
    for y in range(n_tile_y):
        for x in range(n_tile_x):
            tiled_stitched[y, x, :, :] = stitched_img[y * tile_y_size:(
                y + 1) * tile_y_size, x * tile_x_size:(x + 1) * tile_x_size]

    # Convert tiled image to xarray DataArray and configure coordinates and dimensions
    dims = ("tile_y", "tile_x", "y", "x")
    coords = {"tile_y": np.arange(n_tile_y),
              "tile_x": np.arange(n_tile_x),
              "y": np.arange(tile_y_size),
              "x": np.arange(tile_x_size), }

    tiled_stitched = xr.DataArray(tiled_stitched, dims=dims, coords=coords)

    # Set chunk sizes for storage
    tiled_stitched = tiled_stitched.chunk(
        {"tile_y": 1, "tile_x": 1, "y": tile_y_size, "x": tile_x_size})
    tiled_stitched = tiled_stitched.to_dataset(name="data")

    # Save the tiled DataArray to the specified Zarr group
    tiled_stitched.to_zarr(zarr_path, mode="w", group=group + "/0")