"""Contains functions for gathering metadata from individual DICOM files or entire directories."""
import logging
import os
import struct
from pathlib import Path
from typing import Sequence, Iterable
import pandas as pd
from tqdm import tqdm
from pydicom import valuerep, errors, dcmread, Dataset, DataElement, sequence
from pydicom.uid import ImplicitVRLittleEndian
from .convert import is_volumetric_ct, split_volume
from .utils import PathLike
# Names exported by a star-import of this module.
__all__ = 'get_file_meta', 'join_tree'
# Multi-valued attributes that ``extract_meta`` flattens into numbered keys,
# e.g. ``PixelSpacing`` -> ``PixelSpacing0``, ``PixelSpacing1``.
SERIAL = {'ImagePositionPatient', 'ImageOrientationPatient', 'PixelSpacing'}
# Values of this type are stored as plain ``str`` by ``extract_meta``.
PERSON_CLASS = valuerep.PersonName
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
def _throw(e):
raise e
def read_dicom(path: PathLike, force: bool = False):
    """
    Read the DICOM file located at ``path``.

    Parameters
    ----------
    path: PathLike
        path to the file
    force: bool
        whether to retry with ``dcmread(..., force=True)`` if the file is rejected as invalid DICOM.

    Returns
    -------
    A ``(True, dataset)`` tuple. The first element is always True: failures are reported via exceptions.

    Raises
    ------
    pydicom.errors.InvalidDicomError
        if the file is not a valid DICOM file and ``force`` is False.
    """
    filename = str(path)
    try:
        return True, dcmread(filename)
    except errors.InvalidDicomError:
        if not force:
            raise

        dc = dcmread(filename, force=True)
        # A forced read can still pick up a transfer syntax declared in the file. Fall back to the
        # default one only when nothing was declared, so that a declared syntax is not clobbered.
        if 'TransferSyntaxUID' not in dc.file_meta:
            dc.file_meta.TransferSyntaxUID = ImplicitVRLittleEndian
        return True, dc
def iter_private_tags(ds: Dataset) -> Iterable[DataElement]:
    """Lazily yield the values of ``ds`` whose ``is_private`` flag is set."""
    # called only for its side effect, see https://github.com/pydicom/pydicom/issues/1805
    repr(ds)
    yield from (element for element in ds.values() if element.is_private)
def extract_meta(instance: Dataset, read_pixel_array: bool = False, extract_private: bool = False) -> dict:
    """
    Collect the metadata of ``instance`` into a flat dict.

    Parameters
    ----------
    instance: Dataset
        the dataset to read the attributes from
    read_pixel_array: bool
        whether to try accessing ``instance.pixel_array``. If True, the ``HasPixelArray`` key is added,
        and ``NoError`` is set to False if the access fails with a ``ValueError`` or ``RuntimeError``.
    extract_private: bool
        whether to additionally store the values of the private tags, keyed by the tag's ``name``.

    Returns
    -------
    A dict with an entry for each attribute listed by ``instance.dir()`` whose value is a person name
    (stored as ``str``), an ``int``, a ``float`` or a ``str``. Other values are only kept for the
    attributes listed in ``SERIAL``: they are stored element-wise under ``<attr>0``, ``<attr>1``, ...
    Attributes that are None or fail to be read are skipped.
    """
    result = {}

    if read_pixel_array:
        try:
            has_px = hasattr(instance, 'pixel_array')
        except (TypeError, NotImplementedError):
            has_px = False
        except (ValueError, RuntimeError):
            # the pixel data is there, but it could not be read
            has_px = True
            result['NoError'] = False
        # TODO: 7FE0?
        result['HasPixelArray'] = has_px

    for attr in instance.dir():
        try:
            value = instance.get(attr)
        # `Exception` rather than `BaseException`: a broken attribute is skipped,
        # but KeyboardInterrupt and SystemExit must still be able to stop the caller
        except Exception as e:
            logger.debug('Exception while accessing key "%s": %s %s', attr, e.__class__.__name__, e)
            continue

        if value is None:
            continue

        if isinstance(value, PERSON_CLASS):
            result[attr] = str(value)
        elif isinstance(value, (int, float, str)):
            result[attr] = value
        elif attr in SERIAL:
            # flatten the multi-valued attribute into numbered keys
            for pos, num in enumerate(value):
                result[f'{attr}{pos}'] = num

    if extract_private:
        for private_tag in iter_private_tags(instance):
            if private_tag.VR in valuerep.LONG_VALUE_VR:
                continue

            value = instance.get(private_tag.tag).value
            if isinstance(value, (int, float)):
                result[private_tag.name] = value
            elif isinstance(value, str):
                # keep at most 100 characters, just in case
                result[private_tag.name] = value[:100]

    return result
def join_tree(top: PathLike, ignore_extensions: Sequence[str] = (), relative: bool = True, verbose: int = 0,
              read_pixel_array: bool = False, force: bool = True, unpack_volumetric: bool = True,
              extract_private: bool = False, total: bool = False) -> pd.DataFrame:
    """
    Returns a dataframe containing metadata for each file in all the subfolders of ``top``.

    Parameters
    ----------
    top: PathLike
        path to crawled folder
    ignore_extensions: Sequence
        list of extensions to skip during crawling. Each extension must start with a dot.
    relative: bool
        whether the ``PathToFolder`` attribute should be relative to ``top``. Default is True.
    verbose: int
        the verbosity level:
        | 0 - no progressbar
        | 1 - progressbar with iterations count
        | 2 - progressbar with filenames
    read_pixel_array: bool
        passed to ``get_file_meta`` for each file. See Notes for the columns that depend on it.
    force: bool
        passed to ``get_file_meta`` for each file.
    unpack_volumetric: bool
        passed to ``get_file_meta`` for each file.
    extract_private: bool
        passed to ``get_file_meta`` for each file.
    total: bool
        whether to show the total number of files in the progressbar. Only has an effect if ``verbose`` is not 0.
        This adds a bit of overhead, because each file will be visited a second time (without being opened).

    Returns
    -------
    A dataframe with a row for each entry yielded by ``get_file_meta``.

    Raises
    ------
    ValueError
        if one of ``ignore_extensions`` does not start with a dot.

    References
    ----------
    See the :doc:`tutorials/dicom` tutorial for more details.

    Notes
    -----
    Symbolic links to directories are followed, and errors raised while listing directories are propagated.

    The following columns are added:
        | NoError: whether the file was read without errors (False if an exception was raised).
        | HasPixelArray:(if NoError is True) whether the file contains a pixel array(added if read_pixel_array is True).
        | PixelArrayShape: (if HasPixelArray is True) the shape of the pixel array (added if read_pixel_array is True).
        | PathToFolder
        | FileName

    For some formats the following packages might be required::

        conda install -c glueviz gdcm # Python 3.5 and 3.6
        conda install -c conda-forge gdcm # Python 3.7
    """
    # `str.endswith` accepts a tuple of suffixes; an empty tuple never matches
    ignored = tuple(ignore_extensions)
    for extension in ignored:
        if not extension.startswith('.'):
            raise ValueError(f'Each extension must start with a dot: "{extension}".')

    n_files = None
    if total and verbose:
        # first pass: only count the files, so that the main progressbar knows its total
        n_files = 0
        with tqdm(desc='Counting files') as bar:
            for _, _, files in os.walk(top, onerror=_throw, followlinks=True):
                for filename in files:
                    if not filename.endswith(ignored):
                        n_files += 1
                        bar.update()

    result = []
    # the context manager closes the progressbar even if reading a file raises
    with tqdm(disable=not verbose, total=n_files) as bar:
        for root, _, files in os.walk(top, onerror=_throw, followlinks=True):
            root = Path(root)
            rel_path = root.relative_to(top)

            for filename in files:
                if filename.endswith(ignored):
                    continue

                bar.update()
                if verbose > 1:
                    bar.set_description(str(rel_path / filename))

                for entry in get_file_meta(root / filename, force=force, read_pixel_array=read_pixel_array,
                                           unpack_volumetric=unpack_volumetric, extract_private=extract_private):
                    entry['PathToFolder'] = str(rel_path if relative else root)
                    entry['FileName'] = filename
                    result.append(entry)

    return pd.DataFrame(result)