Skip to content

aoi_grouping

AOI grouping step logic.

kelp.data_prep.aoi_grouping.AOIGroupingConfig

Bases: ConfigBase

AOI grouping configuration

Source code in kelp/data_prep/aoi_grouping.py
27
28
29
30
31
32
33
34
35
class AOIGroupingConfig(ConfigBase):
    """AOI grouping configuration.

    Holds the settings that `main` forwards, field by field, to `group_aoi`.
    """

    # Directory with the DEM images.
    dem_dir: Path
    # Path to the metadata CSV file.
    metadata_fp: Path
    # Directory where the results are saved.
    output_dir: Path
    # Batch size to use when generating embeddings.
    batch_size: int = 32
    # Number of worker processes to use when generating embeddings.
    num_workers: int = 6
    # Similarity threshold to use when comparing individual image pairs.
    similarity_threshold: float = 0.95

kelp.data_prep.aoi_grouping.ImageDataset

Bases: Dataset

A simple image dataset that loads images from a list of file paths.

Parameters:

Name Type Description Default
fps List[Path]

The file paths.

required
transform Callable[[Any], Tensor]

Transform to apply to the input images.

required
Source code in kelp/data_prep/aoi_grouping.py
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
class ImageDataset(Dataset):
    """
    Dataset backed by a list of image file paths.

    Every item is read from disk on access, converted to RGB and passed through `transform`.

    Args:
        fps: The file paths.
        transform: Transform to apply to the input images.
    """

    def __init__(self, fps: List[Path], transform: Callable[[Any], Tensor]) -> None:
        self.transform = transform
        self.fps = fps

    def __getitem__(self, idx: int) -> Tensor:
        """
        Load and transform the image stored at the given position.

        Args:
            idx: The image index.

        Returns: A tensor with transformed image

        """
        image_path = self.fps[idx]
        with open(image_path, "rb") as handle:
            # Convert while the file is still open so the pixel data is actually read.
            rgb_image = Image.open(handle).convert("RGB")
        return self.transform(rgb_image)

    def __len__(self) -> int:
        """
        Report how many images the dataset holds.

        Returns: The number of images in the dataset.

        """
        return len(self.fps)

kelp.data_prep.aoi_grouping.calculate_similarity_groups

Calculate similarity groups.

Parameters:

Name Type Description Default
dataset ImageDataset

An instance of ImageDataset class.

required
features ndarray

The embeddings for all images.

required
threshold float

The similarity threshold to use when comparing individual image pairs.

0.95
Source code in kelp/data_prep/aoi_grouping.py
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
@timed
def calculate_similarity_groups(
    dataset: ImageDataset,
    features: np.ndarray,  # type: ignore[type-arg]
    threshold: float = 0.95,
) -> List[List[str]]:
    """
    Calculate similarity groups.

    For every image the pairwise cosine similarity to all other images is compared against
    `threshold`. An image with at least one match yields a sorted group made of its own tile ID
    and the tile IDs of all matches; identical groups are emitted only once. An image without
    any match yields a single-element group.

    Args:
        dataset: An instance of ImageDataset class.
        features: The embeddings for all images.
        threshold: The similarity threshold to use when comparing individual image pairs.

    Returns: A list of groups, each being a list of tile IDs of similar images.

    """
    similarity_matrix = cosine_similarity(features)
    # The tile ID is the file stem up to the first underscore; parse it once per image
    # instead of once per compared pair.
    tile_ids = [fp.stem.split("_")[0] for fp in dataset.fps]
    groups: List[List[str]] = []
    # Hashable copies of the multi-image groups emitted so far, so the duplicate check
    # is a set lookup instead of a scan over the whole `groups` list.
    seen_groups: set[Tuple[str, ...]] = set()
    for i, row in enumerate(
        tqdm(similarity_matrix, desc="Grouping similar images", total=len(similarity_matrix))
    ):
        # The image itself is excluded from its own matches.
        similar_images = [tile_ids[j] for j in np.flatnonzero(row >= threshold) if j != i]
        if not similar_images:
            groups.append([tile_ids[i]])
            continue
        similar_images.append(tile_ids[i])
        similar_images.sort()
        group_key = tuple(similar_images)
        if group_key in seen_groups:
            continue
        seen_groups.add(group_key)
        groups.append(similar_images)
    return groups

kelp.data_prep.aoi_grouping.explode_groups_if_needed

Splits every group that is larger than the explode threshold into single-image groups; smaller groups are kept as they are.

Parameters:

Name Type Description Default
groups List[List[str]]

The list of groups of similar images to explode.

required
Source code in kelp/data_prep/aoi_grouping.py
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
@timed
def explode_groups_if_needed(groups: List[List[str]]) -> List[List[str]]:
    """
    Break up oversized groups into single-image groups.

    A group with more than `IMAGES_PER_GROUP_EXPLODE_THRESHOLD` members is replaced by one
    single-element group per member. All other groups are passed through unchanged.

    Args:
        groups: The list of groups of similar images to explode.

    Returns: A list of groups of similar images.

    """
    exploded: List[List[str]] = []
    for members in groups:
        if len(members) <= IMAGES_PER_GROUP_EXPLODE_THRESHOLD:
            exploded.append(members)
        else:
            # Too large to keep as one group: every tile becomes a group of its own.
            exploded.extend([tile_id] for tile_id in members)
    return exploded

kelp.data_prep.aoi_grouping.find_similar_images

Finds similar images in specified data folder.

Parameters:

Name Type Description Default
data_folder Path

The data folder with input images.

required
tile_ids List[str]

A list of Tile IDs for corresponding images.

required
threshold float

The similarity threshold to use when comparing individual image pairs.

0.95
batch_size int

Batch size to use when generating embeddings.

32
num_workers int

Number of worker processes to use when generating embeddings.

6
Source code in kelp/data_prep/aoi_grouping.py
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
@timed
def find_similar_images(
    data_folder: Path,
    tile_ids: List[str],
    threshold: float = 0.95,
    batch_size: int = 32,
    num_workers: int = 6,
) -> List[List[str]]:
    """
    Find groups of similar images in the specified data folder.

    Runs `generate_embeddings` for the given tiles and hands the result to
    `calculate_similarity_groups`.

    Args:
        data_folder: The data folder with input images.
        tile_ids: A list of Tile IDs for corresponding images.
        threshold: The similarity threshold to use when comparing individual image pairs.
        batch_size: Batch size to use when generating embeddings.
        num_workers: Number of worker processes to use when generating embeddings.

    Returns: A list of similar images.

    """
    embeddings, image_dataset = generate_embeddings(
        data_folder=data_folder,
        tile_ids=tile_ids,
        batch_size=batch_size,
        num_workers=num_workers,
    )
    return calculate_similarity_groups(
        dataset=image_dataset,
        features=embeddings,
        threshold=threshold,
    )

kelp.data_prep.aoi_grouping.generate_embeddings

Generates embeddings for images in specified data folder.

Parameters:

Name Type Description Default
data_folder Path

A path to the data folder.

required
tile_ids List[str]

A list of Tile IDs for corresponding images.

required
batch_size int

Batch size to use when generating embeddings.

32
num_workers int

Number of worker processes to use when generating embeddings.

6
Source code in kelp/data_prep/aoi_grouping.py
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
@timed
def generate_embeddings(
    data_folder: Path,
    tile_ids: List[str],
    batch_size: int = 32,
    num_workers: int = 6,
) -> Tuple[np.ndarray, ImageDataset]:  # type: ignore[type-arg]
    """
    Generate embeddings for the images of the given tiles.

    Each tile is expected at `<data_folder>/<tile_id>_dem.png`. The paths are sorted, so the
    rows of the returned array follow the order of `dataset.fps`, not the order of `tile_ids`.
    The images are fed through an ImageNet-pretrained ResNet-50 that is called unmodified,
    so each row is whatever the network's forward pass returns for that image.

    Args:
        data_folder: A path to the data folder.
        tile_ids: A list of Tile IDs for corresponding images.
        batch_size: Batch size to use when generating embeddings.
        num_workers: Number of worker processes to use when generating embeddings

    Returns: A tuple of array with embeddings for each tile and an Image Dataset instance.

    """
    image_paths = sorted(data_folder / f"{tile_id}_dem.png" for tile_id in tile_ids)
    device = "cuda" if torch.cuda.is_available() else "cpu"

    model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)
    model.eval()
    model.to(device)

    resize = transforms.Resize((224, 224))
    to_tensor = transforms.ToTensor()
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    preprocessing = transforms.Compose([resize, to_tensor, normalize])

    dataset = ImageDataset(fps=image_paths, transform=preprocessing)
    # shuffle=False keeps the batches aligned with the order of dataset.fps.
    loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False)

    with torch.no_grad():
        per_batch_outputs = [
            model(batch.to(device)).detach().cpu() for batch in tqdm(loader, desc="Calculating embeddings")
        ]

    return torch.cat(per_batch_outputs, dim=0).numpy(), dataset

kelp.data_prep.aoi_grouping.group_aoi

Groups images in the specified DEM directory into similar AOIs.

Parameters:

Name Type Description Default
dem_dir Path

The path to the directory with DEM images.

required
metadata_fp Path

The path to the metadata CSV file.

required
output_dir Path

The path where to save the results.

required
batch_size int

Batch size to use when generating embeddings.

32
num_workers int

Number of worker processes to use when generating embeddings.

6
similarity_threshold float

The similarity threshold to use when comparing individual image pairs.

0.95
Source code in kelp/data_prep/aoi_grouping.py
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
@timed
def group_aoi(
    dem_dir: Path,
    metadata_fp: Path,
    output_dir: Path,
    batch_size: int = 32,
    num_workers: int = 6,
    similarity_threshold: float = 0.95,
) -> None:
    """
    Group the training tiles found in the DEM directory into AOIs of similar images.

    Only tiles flagged as training tiles in the metadata are grouped. The intermediate, merged
    and final groups are written to `output_dir` as JSON files, and the metadata, left-joined
    with the final group assignment on `tile_id`, is written as a parquet file. Because of the
    left join, tiles that received no group keep their metadata row without a group value.

    Args:
        dem_dir: The path to the directory with DEM images.
        metadata_fp: The path to the metadata CSV file.
        output_dir: The path where to save the results.
        batch_size: Batch size to use when generating embeddings.
        num_workers: Number of worker processes to use when generating embeddings.
        similarity_threshold: The similarity threshold to use when comparing individual image pairs.

    """
    metadata = pd.read_csv(metadata_fp)
    # The derived "split" column stays on the frame and therefore ends up in the parquet output.
    metadata["split"] = metadata["in_train"].apply(lambda in_train: "train" if in_train else "test")
    is_training_tile = metadata["split"] == consts.data.TRAIN
    training_tiles = metadata.loc[is_training_tile, "tile_id"].tolist()

    intermediate_groups = find_similar_images(
        data_folder=dem_dir,
        tile_ids=training_tiles,
        threshold=similarity_threshold,
        batch_size=batch_size,
        num_workers=num_workers,
    )
    save_json(output_dir / f"intermediate_image_groups_{similarity_threshold=}.json", intermediate_groups)

    merged_groups = group_duplicate_images(groups=intermediate_groups)
    save_json(output_dir / f"merged_image_groups_{similarity_threshold=}.json", merged_groups)

    final_groups = explode_groups_if_needed(groups=merged_groups)
    save_json(output_dir / f"final_image_groups_{similarity_threshold=}.json", final_groups)

    groups_df = groups_to_dataframe(final_groups)
    metadata_with_groups = metadata.merge(
        groups_df,
        left_on="tile_id",
        right_on="tile_id",
        how="left",
    )
    metadata_with_groups.to_parquet(output_dir / f"metadata_{similarity_threshold=}.parquet", index=False)

kelp.data_prep.aoi_grouping.group_duplicate_images

Groups duplicate (similar) images.

Parameters:

Name Type Description Default
groups List[List[str]]

A list of lists where inner items are similar images.

required
Source code in kelp/data_prep/aoi_grouping.py
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
@timed
def group_duplicate_images(groups: List[List[str]]) -> List[List[str]]:
    """
    Groups duplicate (similar) images.

    Input groups that share at least one tile ID are merged, and the merge is transitive: if
    group A overlaps group B and group B overlaps group C, all three end up in one output
    group even when A and C have no tile in common. Every tile ID appears in exactly one
    output group.

    The output is deterministic: groups are ordered by the first appearance of any of their
    tiles in `groups`, and tiles inside a group are ordered by their first appearance.

    Args:
        groups: A list of lists where inner items are similar images.

    Returns: Deduplicated list of the grouped images.

    """
    # Disjoint-set forest: every tile points at a parent tile, and a root points at itself.
    # Dict insertion order doubles as the "first appearance" order of the tiles.
    parent: Dict[str, str] = {}

    def find_root(tile_id: str) -> str:
        """Return the representative of the set holding `tile_id`, compressing the path."""
        root = tile_id
        while parent[root] != root:
            root = parent[root]
        # Re-point every visited tile directly at the root to keep later lookups short.
        while parent[tile_id] != root:
            next_tile_id = parent[tile_id]
            parent[tile_id] = root
            tile_id = next_tile_id
        return root

    for similar_image_group in groups:
        if not similar_image_group:
            continue
        for tile_id in similar_image_group:
            parent.setdefault(tile_id, tile_id)
        # Join every member of the group with the set of the group's first member.
        anchor_root = find_root(similar_image_group[0])
        for tile_id in similar_image_group[1:]:
            tile_root = find_root(tile_id)
            if tile_root != anchor_root:
                parent[tile_root] = anchor_root

    # Collect the members of every set, keyed by the set's representative.
    members_by_root: Dict[str, List[str]] = {}
    for tile_id in parent:
        members_by_root.setdefault(find_root(tile_id), []).append(tile_id)

    return list(members_by_root.values())

kelp.data_prep.aoi_grouping.groups_to_dataframe

Creates a DataFrame with groups of similar images.

Parameters:

Name Type Description Default
groups List[List[str]]

A list of groups of similar images.

required
Source code in kelp/data_prep/aoi_grouping.py
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
@timed
def groups_to_dataframe(groups: List[List[str]]) -> pd.DataFrame:
    """
    Flatten groups of similar images into a two-column DataFrame.

    Every tile becomes one row; `aoi_id` is the zero-based position of the tile's group
    in `groups`.

    Args:
        groups: A list of groups of similar images.

    Returns: A DataFrame with groups of similar images.

    """
    rows = [(tile_id, aoi_id) for aoi_id, members in enumerate(groups) for tile_id in members]
    return pd.DataFrame(rows, columns=["tile_id", "aoi_id"])

kelp.data_prep.aoi_grouping.main

Main entrypoint for grouping similar tiles together into AOIs

Source code in kelp/data_prep/aoi_grouping.py
391
392
393
394
395
396
397
398
399
400
401
def main() -> None:
    """Command line entrypoint: parse the arguments and group similar tiles into AOIs."""
    config = parse_args()
    group_aoi(
        dem_dir=config.dem_dir,
        metadata_fp=config.metadata_fp,
        output_dir=config.output_dir,
        batch_size=config.batch_size,
        num_workers=config.num_workers,
        similarity_threshold=config.similarity_threshold,
    )

kelp.data_prep.aoi_grouping.parse_args

Parse command line arguments.

Returns: An instance of AOI Grouping Config.

Source code in kelp/data_prep/aoi_grouping.py
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
def parse_args() -> AOIGroupingConfig:
    """
    Parse command line arguments.

    The optional arguments default to the same values as the fields of `AOIGroupingConfig`.
    As a side effect, the parsed configuration is logged and the output directory is created
    (including missing parents) if it does not exist yet.

    Returns: An instance of AOI Grouping Config.

    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--dem_dir",
        type=Path,
        required=True,
        help="The path to the directory with DEM images.",
    )
    parser.add_argument(
        "--metadata_fp",
        type=Path,
        required=True,
        help="The path to the metadata CSV file.",
    )
    parser.add_argument(
        "--output_dir",
        type=Path,
        required=True,
        help="The path where to save the results.",
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        default=32,
        help="Batch size to use when generating embeddings.",
    )
    parser.add_argument(
        "--num_workers",
        type=int,
        default=6,
        help="Number of worker processes to use when generating embeddings.",
    )
    parser.add_argument(
        "--similarity_threshold",
        type=float,
        # Kept in sync with the default of AOIGroupingConfig.similarity_threshold.
        default=0.95,
        help="The similarity threshold to use when comparing individual image pairs.",
    )
    args = parser.parse_args()
    cfg = AOIGroupingConfig(**vars(args))
    cfg.log_self()
    cfg.output_dir.mkdir(exist_ok=True, parents=True)
    return cfg

kelp.data_prep.aoi_grouping.save_json

Saves data to specified JSON file.

Parameters:

Name Type Description Default
fp Path

The path to JSON file.

required
data Any

The content to save as JSON.

required
Source code in kelp/data_prep/aoi_grouping.py
312
313
314
315
316
317
318
319
320
321
322
323
@timed
def save_json(fp: Path, data: Any) -> None:
    """
    Saves data to specified JSON file.

    Args:
        fp: The path to JSON file.
        data: The content to save as JSON.

    """
    with open(fp, "w") as file:
        json.dump(data, file, indent=4)