I have a folder structure with some EPUB and JSON files in the deepest (leaf) folders (not counting the .ts folders). I’m exporting tags from the JSON files to TagSpaces by creating a .ts folder that contains some other JSON files. I’ve already processed part of the files, and now I want to find the leaf folders that don’t have a .ts folder in their path, so that I can find the remaining files without having to process the others twice.

I want to process the files in each directory as I find it, instead of first collecting a list of directories and then looping through them. In the example below I return the list of directories only so that I can test it.

So for this example I only want to do something for the folder t5:

test
├── t1
│   ├── t2
│   │   └── t5
│   └── t3
│       └── .ts
└── .ts
    └── t4

This is what I’ve tried:

import os
import shutil
from typing import List

def process_files_in_leaf_subdirectories(dir: str) -> List[str]:
    """Return the leaf directories under *dir* that are not inside a ``.ts`` folder.

    A leaf is a directory with no subdirectories at all, so a folder whose
    only child is ``.ts`` is not reported.  Directories that have ``.ts`` as
    one of their path components are skipped as well.

    Args:
        dir: Root directory to walk.

    Returns:
        The matching directory paths, in the order ``os.walk`` visits them.
    """
    leaf_dirs = []
    for root, subdirs, _filenames in os.walk(dir):
        # Compare whole path components: a substring test on the path would
        # also reject unrelated names such as "notes.tsx" or "my.ts.bak".
        if '.ts' in os.path.normpath(root).split(os.sep):
            # Nothing below a .ts folder can qualify, so stop descending.
            subdirs.clear()
            continue
        if subdirs:
            # Not a leaf.  Prune .ts in place so os.walk never enters it;
            # the remaining subdirectories are still visited.
            if '.ts' in subdirs:
                subdirs.remove('.ts')
            continue
        leaf_dirs.append(root)
    return leaf_dirs


def test_process_files_in_leaf_subdirectories():
    """Check that only ``t5`` is reported for the sample tree.

    The tree is created under ``tmp`` in the current working directory and
    removed again afterwards, even when the assertion fails.
    """
    try:
        os.makedirs('tmp/t1/t2/t5', exist_ok=True)
        os.makedirs('tmp/t1/t3/.ts', exist_ok=True)
        os.makedirs('tmp/.ts/t4', exist_ok=True)
        # os.walk joins with the platform separator, so build the expected
        # path the same way instead of hard-coding "/".
        expected = [os.path.join('tmp', 't1', 't2', 't5')]
        assert process_files_in_leaf_subdirectories('tmp') == expected
    finally:
        # Clean up unconditionally so a failed run leaves no stale tree.
        shutil.rmtree('tmp', ignore_errors=True)

The next example works fine but it gets the list of directories instead of processing the files as they are found:

import os
import shutil
from pathlib import Path
from typing import Iterator, List

def process_files_in_leaf_dir(leaves: List[Path]) -> List[Path]:
    """Collect the ``*.json`` files located directly inside each directory.

    Args:
        leaves: Directories to search; they are iterated once, in order.

    Returns:
        The matching files as ``Path`` objects, grouped by directory in the
        order the directories were given.  Only direct children are matched;
        the glob is not recursive.
    """
    files: List[Path] = []
    for leaf in leaves:
        files.extend(leaf.glob("*.json"))
    return files

def find_leaf_dirs(root_path: Path) -> Iterator[Path]:
    """Lazily yield the leaf directories below *root_path*.

    A directory is a leaf only when it contains no subdirectories at all,
    hidden ones included, so a folder that already holds a ``.ts`` directory
    is never yielded.  Hidden subdirectories (names starting with ``.``) are
    not descended into.

    Args:
        root_path: Directory to start from.  It is yielded itself when it
            has no subdirectories.

    Yields:
        Each leaf directory, as it is discovered.
    """
    # Hidden directories are deliberately kept in this list: their presence
    # means root_path is not a leaf.
    child_dirs = [path for path in root_path.iterdir() if path.is_dir()]

    # No subdirectories: this directory is itself a leaf.
    if not child_dirs:
        yield root_path
        return

    # Otherwise recurse into every visible subdirectory.
    for path in child_dirs:
        if path.name.startswith("."):
            continue
        yield from find_leaf_dirs(path)

def test_process_files_in_leaf_dir():
    """Check that only the JSON file in ``t5`` is collected for the sample tree.

    Every directory in the tree gets a ``test.json``; only the one in the
    single unprocessed leaf (``tmp/t1/t2/t5``) is expected back.  The tree
    is created under ``tmp`` in the current working directory and removed
    afterwards, even when the assertion fails.
    """
    try:
        os.makedirs('tmp/t1/t2/t5', exist_ok=True)
        os.makedirs('tmp/t1/t3/.ts', exist_ok=True)
        os.makedirs('tmp/.ts/t4', exist_ok=True)
        Path('tmp/t1/t2/t5/test.json').touch()
        Path('tmp/t1/t3/test.json').touch()
        Path('tmp/t1/t3/.ts/test.json').touch()
        Path('tmp/.ts/t4/test.json').touch()
        leaves = list(find_leaf_dirs(Path('tmp')))
        expected = [Path('tmp/t1/t2/t5') / 'test.json']
        assert process_files_in_leaf_dir(leaves) == expected
    finally:
        # Clean up unconditionally so a failed run leaves no stale tree.
        shutil.rmtree('tmp', ignore_errors=True)

context