# Source code for dicom_csv.crawler

"""Contains functions for gathering metadata from individual DICOM files or entire directories."""
import logging
import os
import struct
from pathlib import Path
from typing import Sequence, Iterable

import pandas as pd
from tqdm import tqdm
from pydicom import valuerep, errors, dcmread, Dataset, DataElement, sequence
from pydicom.uid import ImplicitVRLittleEndian

from .convert import is_volumetric_ct, split_volume
from .utils import PathLike

# Names exported by ``from <module> import *``.
__all__ = 'get_file_meta', 'join_tree'

# Attributes whose values ``extract_meta`` iterates over, storing each item under the key ``<attr><index>``.
SERIAL = {'ImagePositionPatient', 'ImageOrientationPatient', 'PixelSpacing'}
# Values of this type are converted to ``str`` by ``extract_meta`` before being stored.
PERSON_CLASS = valuerep.PersonName

# Module-level logger; ``extract_meta`` reports attribute access failures on it at DEBUG level.
logger = logging.getLogger(__name__)


def _throw(e):
    """Raise ``e``. Passed as the ``onerror`` callback of ``os.walk`` in ``join_tree``."""
    raise e


def read_dicom(path: PathLike, force: bool = False):
    """
    Read the DICOM file located at ``path``.

    Parameters
    ----------
    path: PathLike
        path to the file, converted to ``str`` before being handed to ``dcmread``
    force: bool
        if True and the first read raises ``InvalidDicomError``, the file is read again with
        ``dcmread(..., force=True)`` and its ``file_meta.TransferSyntaxUID`` is set to
        ``ImplicitVRLittleEndian``. If False, the ``InvalidDicomError`` is propagated.

    Returns
    -------
    tuple
        ``(True, dataset)`` - the first item is always True.
    """
    filename = str(path)
    try:
        dataset = dcmread(filename)
    except errors.InvalidDicomError:
        if not force:
            raise

        # second attempt: let pydicom read the file even though it was not recognized as DICOM
        dataset = dcmread(filename, force=True)
        dataset.file_meta.TransferSyntaxUID = ImplicitVRLittleEndian

    return True, dataset


def iter_private_tags(ds: Dataset) -> Iterable[DataElement]:
    """
    Lazily yield the elements of ``ds.values()`` whose ``is_private`` attribute is truthy.

    Nothing is evaluated until the returned generator is first advanced.
    """
    # Workaround for https://github.com/pydicom/pydicom/issues/1805:
    # the representation is built only for its side effects, the resulting string is discarded.
    ds.__repr__()
    yield from (element for element in ds.values() if element.is_private)


def get_file_meta(path: PathLike, force: bool = True, read_pixel_array: bool = False,
                  unpack_volumetric: bool = False, extract_private: bool = False) -> Iterable[dict]:
    """
    Yield dicts containing the metadata from the DICOM file located at ``path``.

    This is a generator. Normally a single dict is yielded per file. If ``unpack_volumetric`` is True
    and ``is_volumetric_ct`` recognizes the file, one dict is yielded for each instance returned
    by ``split_volume``.

    Parameters
    ----------
    path: PathLike
        full path to file
    force: bool
        forwarded to ``read_dicom``: if True, a file that is not recognized as DICOM is read again
        with pydicom's ``dcmread(..., force=True)``. Default is True.
    read_pixel_array: bool
        if True, the crawler tries to access the pixel array and adds the ``HasPixelArray`` key.
        Enabling it may slow crawling down, because the pixel data has to be accessed.
        Default is False.
    unpack_volumetric: bool
        if True, files recognized by ``is_volumetric_ct`` are split with ``split_volume`` and
        each resulting instance is reported separately. Default is False.
    extract_private: bool
        if True, private elements are added to the result as well (see ``extract_meta``).
        Default is False.

    Notes
    -----
    The following keys are added:
        | NoError: False if the file could not be read, or if accessing the pixel array failed with a
        |   ValueError/RuntimeError (only checked when read_pixel_array is True); True otherwise.
        | HasPixelArray: (only if read_pixel_array is True and the file was read) whether the file
        |   contains a pixel array.

    If the file cannot be read, the only dict yielded is ``{'NoError': False}``.

    For some formats the following packages might be required:
        >>> conda install -c glueviz gdcm # Python 3.5 and 3.6
        >>> conda install -c conda-forge gdcm # Python 3.7
    """
    try:
        # the first item returned by ``read_dicom`` is always True, so it is not needed here
        _, instance = read_dicom(path, force)
    except (errors.InvalidDicomError, struct.error, OSError, NotImplementedError, AttributeError, KeyError):
        # any of these means the file is unreadable: report it instead of interrupting the crawl
        yield {'NoError': False}
        return

    if unpack_volumetric and is_volumetric_ct(instance, errors=False):
        instances = split_volume(instance)
    else:
        instances = [instance]

    for current in instances:
        result = extract_meta(current, read_pixel_array, extract_private)
        # ``extract_meta`` may have already set ``NoError`` to False - keep that value
        result.setdefault('NoError', True)
        yield result


def extract_meta(instance: Dataset, read_pixel_array: bool = False, extract_private: bool = False) -> dict:
    """
    Collect the metadata of a single dataset into a flat dict.

    Parameters
    ----------
    instance: Dataset
        the dataset to extract the metadata from
    read_pixel_array: bool
        if True, ``instance.pixel_array`` is probed and the outcome is stored under ``HasPixelArray``
    extract_private: bool
        if True, the elements yielded by ``iter_private_tags`` are added, keyed by their ``name``

    Returns
    -------
    dict
        For each name in ``instance.dir()`` whose value is not None:
            | ``PERSON_CLASS`` values are stored as ``str``
            | ``int``, ``float`` and ``str`` values are stored as is
            | values of the attributes listed in ``SERIAL`` are stored item by item under ``<attr><index>``
            | anything else is skipped
        ``HasPixelArray`` is present only if ``read_pixel_array`` is True.
        ``NoError`` is present (and False) only if probing the pixel array raised ValueError or RuntimeError.

    Notes
    -----
    Attributes whose access raises an exception are skipped and reported on the module logger
    at DEBUG level.
    """
    result = {}
    if read_pixel_array:
        try:
            # ``hasattr`` only absorbs AttributeError, other errors raised by the probe reach the handlers below
            has_px = hasattr(instance, 'pixel_array')
        except (TypeError, NotImplementedError):
            # treated as "no pixel array"
            has_px = False
        except (ValueError, RuntimeError):
            # treated as "pixel array present, but unreadable": the record is flagged as erroneous
            has_px = True
            result['NoError'] = False

        # TODO: 7FE0?
        result['HasPixelArray'] = has_px

    for attr in instance.dir():
        try:
            value = instance.get(attr)
        except Exception as e:
            # a single broken attribute must not abort the extraction, but KeyboardInterrupt
            # and SystemExit are deliberately not caught, so that a crawl can be stopped
            logger.debug('Exception while accessing key "%s": %s %s', attr, e.__class__.__name__, e)
            continue
        if value is None:
            continue

        if isinstance(value, PERSON_CLASS):
            result[attr] = str(value)

        elif isinstance(value, (int, float, str)):
            result[attr] = value

        elif attr in SERIAL:
            for pos, num in enumerate(value):
                result[f'{attr}{pos}'] = num

    if extract_private:
        for private_tag in iter_private_tags(instance):
            if private_tag.VR in valuerep.LONG_VALUE_VR:
                continue

            # values that are neither numbers nor strings are skipped by the checks below
            value = instance.get(private_tag.tag).value
            # NOTE(review): the key is the element's name, so elements sharing a name overwrite
            # each other - confirm that this is acceptable
            if isinstance(value, (int, float)):
                result[private_tag.name] = value
            elif isinstance(value, str):
                # keep at most 100 characters, just in case the string is long
                result[private_tag.name] = value[:100]

    return result


def join_tree(top: PathLike, ignore_extensions: Sequence[str] = (), relative: bool = True, verbose: int = 0,
              read_pixel_array: bool = False, force: bool = True, unpack_volumetric: bool = True, extract_private: bool = False,
              total: bool = False) -> pd.DataFrame:
    """
    Returns a dataframe containing metadata for each file in all the subfolders of ``top``.

    Parameters
    ----------
    top: PathLike
        path to crawled folder
    ignore_extensions: Sequence
        list of extensions to skip during crawling, each one must start with a dot
    relative: bool
        whether the ``PathToFolder`` attribute should be relative to ``top`` default is True.
    verbose: int
        the verbosity level:
            | 0 - no progressbar
            | 1 - progressbar with iterations count
            | 2 - progressbar with filenames
    read_pixel_array: bool
        passed to ``get_file_meta``: whether to probe the pixel array of each file, default is False.
    force: bool
        passed to ``get_file_meta``: whether to retry reading files that are not recognized as DICOM
        with pydicom's ``force=True``, default is True.
    unpack_volumetric: bool
        passed to ``get_file_meta``: whether files recognized as volumetric should be split into
        separate rows, default is True.
    extract_private: bool
        passed to ``get_file_meta``: whether to add private elements, default is False.
    total: bool
        whether to show the total number of files in the progressbar. Only has an effect if ``verbose`` is not 0.
        This adds a bit of overhead, because each file will be visited a second time (without being opened).

    Raises
    ------
    ValueError
        if one of ``ignore_extensions`` does not start with a dot.

    References
    ----------
    See the :doc:`tutorials/dicom` tutorial for more details.

    Notes
    -----
    The following columns are added:
        | NoError: False if an exception was raised while reading the file, True otherwise.
        | HasPixelArray: (if the file was read) whether the file contains a pixel array (added if read_pixel_array is True).
        | PathToFolder
        | FileName

    Symbolic links to directories are followed (``os.walk(..., followlinks=True)``), and errors raised
    while listing directories are propagated to the caller.

    For some formats the following packages might be required:
        >>> conda install -c glueviz gdcm # Python 3.5 and 3.6
        >>> conda install -c conda-forge gdcm # Python 3.7
    """
    # materialized once: ``str.endswith`` accepts a tuple of suffixes (an empty tuple never matches),
    # and a one-shot iterable would otherwise be exhausted by the validation below
    ignored = tuple(ignore_extensions)
    for extension in ignored:
        if not extension.startswith('.'):
            raise ValueError(f'Each extension must start with a dot: "{extension}".')

    n_files = None
    if total and verbose:
        n_files = 0
        # the context manager closes the bar even if ``os.walk`` raises
        with tqdm(desc='Counting files') as counter:
            for _, _, files in os.walk(top, onerror=_throw, followlinks=True):
                for filename in files:
                    if not filename.endswith(ignored):
                        n_files += 1
                        counter.update()

    result = []
    with tqdm(disable=not verbose, total=n_files) as bar:
        for root, _, files in os.walk(top, onerror=_throw, followlinks=True):
            root = Path(root)
            rel_path = root.relative_to(top)
            # the same for every file in this folder
            folder = str(rel_path if relative else root)

            for filename in files:
                if filename.endswith(ignored):
                    continue

                bar.update()
                if verbose > 1:
                    bar.set_description(str(rel_path / filename))

                # a single file may produce several rows, e.g. when it is unpacked into separate instances
                for entry in get_file_meta(root / filename, force=force, read_pixel_array=read_pixel_array,
                                           unpack_volumetric=unpack_volumetric, extract_private=extract_private):
                    entry['PathToFolder'] = folder
                    entry['FileName'] = filename
                    result.append(entry)

    return pd.DataFrame(result)