Source code for megafish.load

import os

import dask.config
import numpy as np
import pandas as pd
from tqdm import tqdm
import zarr
import xarray as xr
import dask
from dask.diagnostics import ProgressBar
import dask.array as da
from imaris_ims_file_reader.ims import ims
import tifffile

from .utils import natural_sort, get_tile_yx
from .config import show_resource


def make_dirlist(dirlist_path, image_dir):
    """
    Generate a CSV file listing all cycle directories within an image directory.

    Every entry of ``image_dir`` is joined with ``image_dir``, the resulting
    paths are ordered with ``natural_sort``, and only those that are
    directories are kept. The result is written as a single-column CSV
    (column name ``folder``).

    Args:
        dirlist_path (str): The file path to save the generated directory
            list CSV. Missing parent directories are created.
        image_dir (str): The path to the main directory containing subfolders
            for each cycle.

    Returns:
        None: The function creates a CSV file at dirlist_path.
    """
    candidates = [
        os.path.join(image_dir, name) for name in os.listdir(image_dir)]
    dirs = [dir_ for dir_ in natural_sort(candidates) if os.path.isdir(dir_)]
    df = pd.DataFrame({"folder": dirs})

    # dirname() is "" when dirlist_path is a bare filename, and
    # os.makedirs("") raises; only create the parent when there is one.
    out_dir = os.path.dirname(dirlist_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    df.to_csv(dirlist_path, index=False)
def make_imagepath_cYX_from_dirlist(
        zarr_path, groups, channels, n_cycle, n_tile_y, n_tile_x, scan_type,
        dirlist_path, subfooter="", footer="_imagepath", ext=".tif"):
    """
    Generate a CSV file mapping image paths to group, cycle, tile, and
    channel, using the cycle directories listed in a directory-list CSV.

    For every cycle directory, the files ending with ``ext`` are naturally
    sorted and paired, in order, with the tile coordinates returned by
    ``get_tile_yx``. The pairing uses ``zip``, so surplus files or surplus
    tile coordinates are silently ignored. Each (group, channel) pair gets
    its own set of rows pointing at the same files.

    Args:
        zarr_path (str): Path to the base .zarr file. The output CSV path is
            derived from it by replacing ".zarr" with
            ``subfooter + footer + ".csv"``.
        groups (list of str): List of group names, each corresponding to a
            specific analysis group.
        channels (list): Channels corresponding to the groups (paired
            positionally with ``groups``).
        n_cycle (int): Total number of cycles. Currently unused: the number
            of cycles is determined by the rows of the directory list.
        n_tile_y (int): Number of tiles along the y-axis.
        n_tile_x (int): Number of tiles along the x-axis.
        scan_type (str): Type of scan, determining the tile layout.
        dirlist_path (str): Path to the CSV file with the list of cycle
            directories (column ``folder``).
        subfooter (str, optional): String to append before the footer in the
            output CSV filename; defaults to an empty string.
        footer (str, optional): String appended to the output CSV filename;
            defaults to "_imagepath".
        ext (str, optional): File extension of the image files; defaults to
            ".tif".

    Returns:
        None: The function creates a CSV file with the generated image paths
        and associated metadata at the modified `zarr_path`.
    """
    # Define output path for image paths CSV
    imagepath_path = zarr_path.replace(
        ".zarr", subfooter + footer + ".csv")

    # Tile coordinates (y, x) in acquisition order for this scan type
    tile_yxs = get_tile_yx(n_tile_y, n_tile_x, scan_type)

    # Cycle directories, in the order stored in the directory list
    dirs = pd.read_csv(dirlist_path)["folder"].values

    columns = ["group", "cycle", "tile_y", "tile_x", "path", "channel"]
    rows = []
    for cycle, dir_ in enumerate(dirs, start=1):
        # The file listing does not depend on the group, so filter and sort
        # it once per cycle rather than once per (cycle, group).
        files = natural_sort(
            [file for file in os.listdir(dir_) if file.endswith(ext)])
        paths = [os.path.join(dir_, file) for file in files]

        for group_name, channel in zip(groups, channels):
            for (tile_y, tile_x), path in zip(tile_yxs, paths):
                # Cycle and tile indices are stored 1-based in the CSV
                rows.append(
                    (group_name, cycle, tile_y + 1, tile_x + 1, path,
                     channel))

    df = pd.DataFrame(rows, columns=columns)
    df.to_csv(imagepath_path, index=False)
def make_imagepath_cYX(
        zarr_path, groups, channels, n_cycle, n_tile_y, n_tile_x, scan_type,
        image_dir, subfooter="", footer="_imagepath", ext=".ims"):
    """
    Generate a CSV file mapping image paths to group, cycle, tile, and
    channel, treating each subdirectory of ``image_dir`` as one cycle.

    Subdirectories are visited in natural sort order. Within each one, the
    files ending with ``ext`` are naturally sorted and paired, in order, with
    the tile coordinates returned by ``get_tile_yx``. The pairing uses
    ``zip``, so surplus files or surplus tile coordinates are silently
    ignored. Each (group, channel) pair gets its own set of rows pointing at
    the same files.

    Args:
        zarr_path (str): Path to the base .zarr file. The output CSV path is
            derived from it by replacing ".zarr" with
            ``subfooter + footer + ".csv"``.
        groups (list of str): List of group names, each corresponding to a
            specific analysis group.
        channels (list): Channels corresponding to the groups (paired
            positionally with ``groups``).
        n_cycle (int): Total number of cycles. Currently unused: the number
            of cycles is determined by the subdirectories found.
        n_tile_y (int): Number of tiles along the y-axis.
        n_tile_x (int): Number of tiles along the x-axis.
        scan_type (str): Type of scan, determining the tile layout.
        image_dir (str): Path to the main directory containing one
            subdirectory per cycle.
        subfooter (str, optional): String to append before the footer in the
            output CSV filename; defaults to an empty string.
        footer (str, optional): String appended to the output CSV filename;
            defaults to "_imagepath".
        ext (str, optional): File extension of the image files; defaults to
            ".ims".

    Returns:
        None: The function creates a CSV file with image paths and associated
        metadata at the modified `zarr_path`.
    """
    # Define output path for image paths CSV
    imagepath_path = zarr_path.replace(
        ".zarr", subfooter + footer + ".csv")

    # Tile coordinates (y, x) in acquisition order for this scan type
    tile_yxs = get_tile_yx(n_tile_y, n_tile_x, scan_type)

    # Only directories can be cycles. Skipping plain files (e.g. a CSV or an
    # OS metadata file sitting next to the cycle folders) avoids a crash in
    # os.listdir below and keeps cycle numbering tied to real cycle folders.
    sub_dirs = [
        sub_dir for sub_dir in natural_sort(os.listdir(image_dir))
        if os.path.isdir(os.path.join(image_dir, sub_dir))]

    columns = ["group", "cycle", "tile_y", "tile_x", "path", "channel"]
    rows = []
    for cycle, sub_dir in enumerate(sub_dirs, start=1):
        sub_img_dir = os.path.join(image_dir, sub_dir)

        # The file listing does not depend on the group, so filter and sort
        # it once per cycle rather than once per (cycle, group).
        files = natural_sort(
            [file for file in os.listdir(sub_img_dir) if file.endswith(ext)])
        paths = [os.path.join(sub_img_dir, file) for file in files]

        for group_name, channel in zip(groups, channels):
            for (tile_y, tile_x), path in zip(tile_yxs, paths):
                # Cycle and tile indices are stored 1-based in the CSV
                rows.append(
                    (group_name, cycle, tile_y + 1, tile_x + 1, path,
                     channel))

    df = pd.DataFrame(rows, columns=columns)
    df.to_csv(imagepath_path, index=False)
def ims_cYXzyx(zarr_path, n_z, n_y, n_x, imagepath_footer="_imagepath"):
    """
    Build (cycle, tile_y, tile_x, z, y, x) Zarr arrays from .ims tiles.

    The image-path CSV that sits next to ``zarr_path`` is read to find the
    groups and the extent of the cycle / tile grid. For each group a
    zero-filled uint16 array is first written to ``<group>/0``; then every
    (cycle, tile_y, tile_x) block is filled from the matching .ims file and
    the group is written again.

    Args:
        zarr_path (str): Path to the base .zarr file to store image data.
        n_z (int): Number of z-slices per tile.
        n_y (int): Image height (pixels) for each tile.
        n_x (int): Image width (pixels) for each tile.
        imagepath_footer (str, optional): String to append to the CSV
            filename; defaults to "_imagepath".

    Returns:
        None: The function creates Zarr arrays with image data and writes to
        `zarr_path`.
    """
    # The CSV lives next to the zarr store
    csv_path = zarr_path.replace(".zarr", imagepath_footer + ".csv")
    df_imagepath = pd.read_csv(csv_path)

    # Grid extent and group names come from the CSV (indices there are
    # 1-based, so the maxima are also the counts)
    n_cycle = df_imagepath["cycle"].max()
    n_tile_y = df_imagepath["tile_y"].max()
    n_tile_x = df_imagepath["tile_x"].max()
    groups = df_imagepath["group"].unique()

    dims = ("cycle", "tile_y", "tile_x", "z", "y", "x")
    full_shape = (n_cycle, n_tile_y, n_tile_x, n_z, n_y, n_x)
    coords = {dim: np.arange(size) for dim, size in zip(dims, full_shape)}

    # One chunk per (cycle, tile_y, tile_x) holding the whole z/y/x volume
    tile_chunks = (1, 1, 1, n_z, n_y, n_x)

    # Write a zero-filled array for every group
    placeholder = da.zeros(full_shape, chunks=tile_chunks, dtype=np.uint16)
    print("Saving empty images: ")
    with ProgressBar():
        for group in groups:
            ds_empty = xr.DataArray(
                placeholder, dims=dims, coords=coords).to_dataset(name="data")
            ds_empty.to_zarr(zarr_path, group=group + "/0", mode="w")

    def _load_ims_zyx(zar, df_group, block_info=None):
        """Return one block filled from its .ims file, or zeros."""
        # Zero block, returned as-is whenever no usable image is available
        output = np.zeros(zar.shape, dtype=np.uint16)

        # The first three chunk indices are cycle, tile_y, tile_x
        cycle, tile_y, tile_x = block_info[0]["chunk-location"][:3]

        # CSV indices are 1-based, chunk indices are 0-based
        matches = df_group[
            (df_group["cycle"] == cycle + 1)
            & (df_group["tile_y"] == tile_y + 1)
            & (df_group["tile_x"] == tile_x + 1)]
        if len(matches) == 0:
            return output

        # Use the first matching row; the CSV channel is 1-based
        channel = matches["channel"].values[0] - 1
        path = matches["path"].values[0]
        img_ims = ims(path)

        # Expected shape: (zoom, channel, z, y, x)
        if len(img_ims.shape) != 5:
            print("Unexpected shape " + str(img_ims.shape) + ": " + path)
            return output
        if img_ims.shape[1] < channel + 1:
            print("No channel found: " + path)
            return output

        img = img_ims[0][channel]
        if len(img.shape) == 2:
            img = np.expand_dims(img, axis=0)

        # Crop to the requested tile size; smaller images stay zero-padded
        img = img[:n_z, :n_y, :n_x]
        output[:, :, :, :img.shape[0], :img.shape[1], :img.shape[2]] = img
        return output

    # Same chunking as above, expressed per dimension name for xarray
    store_chunks = dict(zip(dims, tile_chunks))

    # Fill each group block by block and write it back
    for group in groups:
        dar = da.from_zarr(zarr_path, component=group + "/0/data")
        print(f"Loading cYXzyx ims images: {group}" + show_resource())
        group_df = df_imagepath[df_imagepath["group"] == group]

        res = da.map_blocks(_load_ims_zyx, dar, group_df, dtype=np.uint16)

        with ProgressBar():
            out = xr.DataArray(res, dims=dims, coords=coords)
            out = out.to_dataset(name="data").chunk(chunks=store_chunks)
            out.to_zarr(zarr_path, mode="w", group=group + "/0")
def tif_cYXzyx(zarr_path, n_z, n_y, n_x, imagepath_footer="_imagepath",
               ext=".tif", dtype=None, tif_dims="czyx"):
    """
    Build (cycle, tile_y, tile_x, z, y, x) Zarr arrays from TIFF tiles.

    The image-path CSV that sits next to ``zarr_path`` is read to find the
    groups and the extent of the cycle / tile grid. For each group a
    zero-filled array is first written to ``<group>/0``; then every
    (cycle, tile_y, tile_x) block is filled from the matching TIFF file and
    the group is written again. Blocks with no CSV row, with an image whose
    rank does not match ``tif_dims``, or with too few channels are left as
    zeros (a message is printed for the latter two).

    Args:
        zarr_path (str): Path to the base .zarr file to store image data.
        n_z (int): Number of z-slices per tile.
        n_y (int): Image height (pixels) for each tile.
        n_x (int): Image width (pixels) for each tile.
        imagepath_footer (str, optional): String to append to the CSV
            filename; defaults to "_imagepath".
        ext (str, optional): File extension of the image files; defaults to
            ".tif". Currently unused by this function (paths are taken
            verbatim from the CSV); kept for backward compatibility.
        dtype (str or numpy dtype, optional): Data type of the stored
            arrays; images are cast to it. Defaults to None, meaning uint16.
        tif_dims (str, optional): Axis order of the TIFF files. One of
            "czyx", "zyxc", "cyx", "yxc"; defaults to "czyx".

    Returns:
        None: The function creates Zarr arrays with image data and writes to
        `zarr_path`.

    Raises:
        ValueError: If ``tif_dims`` is not one of the supported layouts.
    """
    # Fail before any data is written, not inside a dask task
    if tif_dims not in ("czyx", "zyxc", "cyx", "yxc"):
        raise ValueError("Unsupported tif_dims")
    channel_axis = tif_dims.index("c")

    # Define the CSV path based on zarr_path
    imagepath_path = zarr_path.replace(".zarr", imagepath_footer + ".csv")

    # Load image paths and metadata from CSV
    df_imagepath = pd.read_csv(imagepath_path)

    # Grid extent from the CSV (indices there are 1-based, so the maxima are
    # also the counts)
    n_cycle = df_imagepath["cycle"].max()
    n_tile_y = df_imagepath["tile_y"].max()
    n_tile_x = df_imagepath["tile_x"].max()

    # Unique group names to create datasets for each group
    groups = df_imagepath["group"].unique()

    # Set array dimensions and coordinates for DataArray
    dims = ("cycle", "tile_y", "tile_x", "z", "y", "x")
    coords = {
        "cycle": np.arange(n_cycle),
        "tile_y": np.arange(n_tile_y),
        "tile_x": np.arange(n_tile_x),
        "z": np.arange(n_z),
        "y": np.arange(n_y),
        "x": np.arange(n_x),
    }

    # One chunk per (cycle, tile_y, tile_x) holding the whole z/y/x volume
    chunks = (1, 1, 1, n_z, n_y, n_x)

    if dtype is None:
        dtype = np.uint16

    # Initialize and save empty Zarr arrays for each group
    empty_data = da.zeros(
        (n_cycle, n_tile_y, n_tile_x, n_z, n_y, n_x),
        chunks=chunks, dtype=dtype)
    print("Saving empty images: ")
    with ProgressBar():
        for group in groups:
            xar = xr.DataArray(empty_data, dims=dims, coords=coords)
            ds = xar.to_dataset(name="data")
            ds.to_zarr(zarr_path, group=group + "/0", mode="w")

    def _load_tif_zyx(zar, df_group, _dtype, block_info=None):
        """Return one block filled from its TIFF file, or zeros."""
        # Zero block, returned as-is whenever no usable image is available
        output = np.zeros(zar.shape, dtype=_dtype)

        # Get cycle and tile coordinates for the current block
        cycle = block_info[0]["chunk-location"][0]
        tile_y = block_info[0]["chunk-location"][1]
        tile_x = block_info[0]["chunk-location"][2]

        # CSV indices are 1-based, chunk indices are 0-based
        df_block = df_group[
            (df_group["cycle"] == cycle + 1)
            & (df_group["tile_y"] == tile_y + 1)
            & (df_group["tile_x"] == tile_x + 1)]
        if len(df_block) == 0:
            return output

        # Use the first matching row; the CSV channel is 1-based
        channel = df_block["channel"].values[0] - 1
        path = df_block["path"].values[0]
        img_tif = tifffile.imread(path)

        # The image rank must match the declared layout; otherwise the
        # channel axis below would be meaningless.
        if len(img_tif.shape) != len(tif_dims):
            print("Unexpected shape " + str(img_tif.shape) + ": " + path)
            return output

        # Check the channel count on the axis that actually holds channels
        # (first for "c...", last for "...c").
        if img_tif.shape[channel_axis] < channel + 1:
            print("No channel found: " + path)
            return output

        # Select the channel; what remains is (z, y, x) or (y, x)
        img = np.moveaxis(img_tif, channel_axis, 0)[channel]
        if img.ndim == 2:
            img = np.expand_dims(img, axis=0)

        # Crop to the requested tile size and cast only the part that is
        # kept; smaller images stay zero-padded
        img = img[:n_z, :n_y, :n_x].astype(_dtype)
        output[:, :, :, :img.shape[0], :img.shape[1], :img.shape[2]] = img
        return output

    # Same chunking as above, expressed per dimension name for xarray
    store_chunks = {"cycle": 1, "tile_y": 1, "tile_x": 1,
                    "z": n_z, "y": n_y, "x": n_x}

    # Load and map images to the Zarr arrays for each group
    for group in groups:
        dar = da.from_zarr(zarr_path, component=group + "/0/data")
        print(f"Loading cYXzyx tif images: {group}" + show_resource())
        group_df = df_imagepath[df_imagepath["group"] == group]

        # Apply _load_tif_zyx to each block
        res = da.map_blocks(_load_tif_zyx, dar, group_df, dtype, dtype=dtype)

        # Convert results to DataArray and save to Zarr
        with ProgressBar():
            out = xr.DataArray(res, dims=dims, coords=coords)
            out = out.to_dataset(name="data")
            out = out.chunk(chunks=store_chunks)
            out.to_zarr(zarr_path, mode="w", group=group + "/0")
def stitched_ims(
        zarr_path, group, image_path, channel, n_tile_y, n_tile_x):
    """
    Split one channel of a stitched .ims image into a regular grid of tiles
    and store the tiles in a Zarr group.

    A 3D image is reduced to 2D by a maximum projection over its first axis.
    The tile size is the integer quotient of the image size by the tile
    count, so any remainder rows/columns at the bottom/right are dropped.

    Args:
        zarr_path (str): Path to the Zarr file where tiled data will be
            saved.
        group (str): Group name in the Zarr file for storing the tiled image
            data; the data is written under ``<group>/0``.
        image_path (str): Path to the stitched image file in .ims format.
        channel (int): Channel index for selecting specific image data.
        n_tile_y (int): Number of tiles along the y-axis.
        n_tile_x (int): Number of tiles along the x-axis.

    Returns:
        None: The function saves tiled images in the specified Zarr group
        without returning any value.
    """
    print("Loading stitched image: " + image_path)
    stitched_img = ims(image_path)[0, channel]

    # Collapse a z-stack with a max projection
    if len(stitched_img.shape) == 3:
        stitched_img = stitched_img.max(axis=0)

    height, width = stitched_img.shape
    tile_h = height // n_tile_y
    tile_w = width // n_tile_x

    # NOTE(review): np.zeros defaults to float64, so the stored tiles are
    # float64 regardless of the source image dtype.
    tiles = np.zeros((n_tile_y, n_tile_x, tile_h, tile_w))
    for row, col in np.ndindex(n_tile_y, n_tile_x):
        y0 = row * tile_h
        x0 = col * tile_w
        tiles[row, col] = stitched_img[y0:y0 + tile_h, x0:x0 + tile_w]

    dims = ("tile_y", "tile_x", "y", "x")
    coords = {
        "tile_y": np.arange(n_tile_y),
        "tile_x": np.arange(n_tile_x),
        "y": np.arange(tile_h),
        "x": np.arange(tile_w),
    }

    # One chunk per tile
    tile_chunks = {"tile_y": 1, "tile_x": 1, "y": tile_h, "x": tile_w}
    dataset = (
        xr.DataArray(tiles, dims=dims, coords=coords)
        .chunk(tile_chunks)
        .to_dataset(name="data"))

    dataset.to_zarr(zarr_path, mode="w", group=group + "/0")
def stitched_tif(
        zarr_path, group, image_path, n_tile_y, n_tile_x, dtype="uint16"):
    """
    Split a stitched TIFF image into a regular grid of tiles and store the
    tiles in a Zarr group.

    A 3D image is reduced to 2D by a maximum projection over its first axis.
    The tile size is the integer quotient of the image size by the tile
    count, so any remainder rows/columns at the bottom/right are dropped.

    Args:
        zarr_path (str): Path to the Zarr file where tiled data will be
            saved.
        group (str): Group name in the Zarr file for storing the tiled image
            data; the data is written under ``<group>/0``.
        image_path (str): Path to the stitched image file in TIFF format.
        n_tile_y (int): Number of tiles along the y-axis.
        n_tile_x (int): Number of tiles along the x-axis.
        dtype (str, optional): Data type of the stored tiles; the image is
            cast to it. Defaults to "uint16".

    Returns:
        None: The function saves tiled images in the specified Zarr group
        without returning any value.

    Raises:
        ValueError: If the image is neither 2D nor 3D.
    """
    # Load stitched TIFF image and cast to specified dtype
    print("Loading stitched tif image: " + image_path)
    stitched_img = tifffile.imread(image_path)
    stitched_img = stitched_img.astype(dtype)

    # If image has 3D shape, perform max projection across the first axis
    if len(stitched_img.shape) == 3:
        stitched_img = stitched_img.max(axis=0)
    if len(stitched_img.shape) != 2:
        raise ValueError(
            "Expected a 2D or 3D stitched image, got shape "
            + str(stitched_img.shape) + ": " + image_path)

    # Define tile dimensions based on image size and tile count
    n_stitched_y, n_stitched_x = stitched_img.shape
    tile_y_size = n_stitched_y // n_tile_y
    tile_x_size = n_stitched_x // n_tile_x

    # Allocate the tile buffer with the requested dtype. np.zeros defaults
    # to float64, which would silently discard the cast above and store
    # float64 data regardless of `dtype`.
    tiled_stitched = np.zeros(
        (n_tile_y, n_tile_x, tile_y_size, tile_x_size),
        dtype=stitched_img.dtype)

    # Slice the stitched image into tiles
    for y in range(n_tile_y):
        for x in range(n_tile_x):
            tiled_stitched[y, x, :, :] = stitched_img[
                y * tile_y_size:(y + 1) * tile_y_size,
                x * tile_x_size:(x + 1) * tile_x_size]

    # Wrap the tiles in a DataArray with integer coordinates
    dims = ("tile_y", "tile_x", "y", "x")
    coords = {
        "tile_y": np.arange(n_tile_y),
        "tile_x": np.arange(n_tile_x),
        "y": np.arange(tile_y_size),
        "x": np.arange(tile_x_size),
    }
    tiled_stitched = xr.DataArray(tiled_stitched, dims=dims, coords=coords)

    # One chunk per tile
    tiled_stitched = tiled_stitched.chunk(
        {"tile_y": 1, "tile_x": 1, "y": tile_y_size, "x": tile_x_size})
    tiled_stitched = tiled_stitched.to_dataset(name="data")

    # Save the tiled DataArray to the specified Zarr group
    tiled_stitched.to_zarr(zarr_path, mode="w", group=group + "/0")