Skip to content

Diffing

Comparison dataclass

Bases: ABC, Generic[_T]

A comparison between two objects of the same type.

Source code in splitgill/diffing.py
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
@dataclass
class Comparison(abc.ABC, Generic[_T]):
    """
    A comparison between two objects of the same type.
    """

    path: Tuple[Union[str, int], ...]
    left: _T
    right: _T

    @abc.abstractmethod
    def compare(self) -> Tuple[Optional[DiffOp], List['Comparison']]:
        """
        Compare the two objects and return a 2-tuple containing a DiffOp and a list of
        further comparisons which need to be handled. If no differences are found, then
        the first element of the returned 2-tuple will be None.

        :return: A 2-tuple containing a DiffOp and a list of further Comparison objects
        """
        pass

compare() abstractmethod

Compare the two objects and return a 2-tuple containing a DiffOp and a list of further comparisons which need to be handled. If no differences are found, then the first element of the returned 2-tuple will be None.

Returns:

Type Description
Tuple[Optional[DiffOp], List[Comparison]]

A 2-tuple containing a DiffOp and a list of further Comparison objects

Source code in splitgill/diffing.py
154
155
156
157
158
159
160
161
162
163
@abc.abstractmethod
def compare(self) -> Tuple[Optional[DiffOp], List['Comparison']]:
    """
    Compare the two objects and return a 2-tuple containing a DiffOp and a list of
    further comparisons which need to be handled. If no differences are found, then
    the first element of the returned 2-tuple will be None.

    :return: A 2-tuple containing a DiffOp and a list of further Comparison objects
    """
    pass

DictComparison dataclass

Bases: Comparison[dict]

A comparison between two dicts.

Source code in splitgill/diffing.py
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
class DictComparison(Comparison[dict]):
    """
    A comparison between two dicts.
    """

    def compare(self) -> Tuple[Optional[DiffOp], List['Comparison']]:
        """
        Compares the two dicts and return a 2-tuple containing a DiffOp and a list of
        further comparisons which need to be handled. If no differences are found, then
        the first element of the returned 2-tuple will be None.

        :return: A 2-tuple containing a DiffOp and a list of further Comparison objects
        """
        missing = object()
        ops = {}
        further_comparisons = []

        new_values = {
            key: value for key, value in self.right.items() if key not in self.left
        }
        if new_values:
            ops['dn'] = new_values

        deleted_keys = [key for key in self.left if key not in self.right]
        if deleted_keys:
            ops['dd'] = deleted_keys

        changes = {}
        for key, left_value in self.left.items():
            right_value = self.right.get(key, missing)

            # deletion or equality, nothing to do
            if right_value is missing or left_value == right_value:
                continue

            # check for nested container objects and add Comparison objects to the list
            # if any are found of the same types
            if isinstance(left_value, dict) and isinstance(right_value, dict):
                further_comparisons.append(
                    DictComparison((*self.path, key), left_value, right_value)
                )
            elif isinstance(left_value, list) and isinstance(right_value, list):
                further_comparisons.append(
                    ListComparison((*self.path, key), left_value, right_value)
                )
            else:
                changes[key] = right_value
        if changes:
            ops['dc'] = changes

        return DiffOp(self.path, ops) if ops else None, further_comparisons

compare()

Compares the two dicts and return a 2-tuple containing a DiffOp and a list of further comparisons which need to be handled. If no differences are found, then the first element of the returned 2-tuple will be None.

Returns:

Type Description
Tuple[Optional[DiffOp], List[Comparison]]

A 2-tuple containing a DiffOp and a list of further Comparison objects

Source code in splitgill/diffing.py
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
def compare(self) -> Tuple[Optional[DiffOp], List['Comparison']]:
    """
    Compares the two dicts and return a 2-tuple containing a DiffOp and a list of
    further comparisons which need to be handled. If no differences are found, then
    the first element of the returned 2-tuple will be None.

    :return: A 2-tuple containing a DiffOp and a list of further Comparison objects
    """
    missing = object()
    ops = {}
    further_comparisons = []

    new_values = {
        key: value for key, value in self.right.items() if key not in self.left
    }
    if new_values:
        ops['dn'] = new_values

    deleted_keys = [key for key in self.left if key not in self.right]
    if deleted_keys:
        ops['dd'] = deleted_keys

    changes = {}
    for key, left_value in self.left.items():
        right_value = self.right.get(key, missing)

        # deletion or equality, nothing to do
        if right_value is missing or left_value == right_value:
            continue

        # check for nested container objects and add Comparison objects to the list
        # if any are found of the same types
        if isinstance(left_value, dict) and isinstance(right_value, dict):
            further_comparisons.append(
                DictComparison((*self.path, key), left_value, right_value)
            )
        elif isinstance(left_value, list) and isinstance(right_value, list):
            further_comparisons.append(
                ListComparison((*self.path, key), left_value, right_value)
            )
        else:
            changes[key] = right_value
    if changes:
        ops['dc'] = changes

    return DiffOp(self.path, ops) if ops else None, further_comparisons

DiffOp

Bases: NamedTuple

A namedtuple describing the differences found by the diff function.

Source code in splitgill/diffing.py
130
131
132
133
134
135
136
137
138
class DiffOp(NamedTuple):
    """
    A namedtuple describing the differences found by the diff function.
    """

    # the path where the changes should be applied at in the root dict
    path: Tuple[Union[str, int], ...]
    # a dict of the changes made at that path
    ops: Dict[str, Any]

DiffingTypeComparisonException

Bases: Exception

Exception raised if the base type and the new type passed to the diff function below are not both dicts.

Source code in splitgill/diffing.py
275
276
277
278
279
280
281
class DiffingTypeComparisonException(Exception):
    """
    Exception raised if the base type and the new type passed to the diff function below
    are not both dicts.
    """

    pass

ListComparison dataclass

Bases: Comparison[list]

A comparison between two lists.

Source code in splitgill/diffing.py
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
@dataclass
class ListComparison(Comparison[list]):
    """
    A comparison between two lists.
    """

    def compare(self) -> Tuple[DiffOp, List['Comparison']]:
        """
        Compares the two lists and return a 2-tuple containing a DiffOp and a list of
        further comparisons which need to be handled. If no differences are found, then
        the first element of the returned 2-tuple will be None.

        :return: A 2-tuple containing a DiffOp and a list of further Comparison objects
        """
        missing = object()
        ops = {}
        further_comparisons = []

        changes = []
        for index, (left_value, right_value) in enumerate(
            zip_longest(self.left, self.right, fillvalue=missing)
        ):
            if left_value == right_value:
                continue

            if left_value is missing:
                # the right list is longer, so store all the new values so that they can
                # just be added to the left list to patch it, and stop
                ops['ln'] = self.right[index:]
                break
            elif right_value is missing:
                # the left value is longer, so store the index from which elements in
                # the left list will be deleted to shorten it to the length of the right
                # list, and stop
                ops['ld'] = index
                break
            else:
                # a change in the values at this index in each list, check for nested
                # container objects and add Comparison objects to the list if any are
                # found of the same types
                if isinstance(left_value, dict) and isinstance(right_value, dict):
                    further_comparisons.append(
                        DictComparison((*self.path, index), left_value, right_value)
                    )
                elif isinstance(left_value, list) and isinstance(right_value, list):
                    further_comparisons.append(
                        ListComparison((*self.path, index), left_value, right_value)
                    )
                else:
                    changes.append((index, right_value))
        if changes:
            ops['lc'] = changes

        return DiffOp(self.path, ops) if ops else None, further_comparisons

compare()

Compares the two lists and return a 2-tuple containing a DiffOp and a list of further comparisons which need to be handled. If no differences are found, then the first element of the returned 2-tuple will be None.

Returns:

Type Description
Tuple[DiffOp, List[Comparison]]

A 2-tuple containing a DiffOp and a list of further Comparison objects

Source code in splitgill/diffing.py
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
def compare(self) -> Tuple[DiffOp, List['Comparison']]:
    """
    Compares the two lists and return a 2-tuple containing a DiffOp and a list of
    further comparisons which need to be handled. If no differences are found, then
    the first element of the returned 2-tuple will be None.

    :return: A 2-tuple containing a DiffOp and a list of further Comparison objects
    """
    missing = object()
    ops = {}
    further_comparisons = []

    changes = []
    for index, (left_value, right_value) in enumerate(
        zip_longest(self.left, self.right, fillvalue=missing)
    ):
        if left_value == right_value:
            continue

        if left_value is missing:
            # the right list is longer, so store all the new values so that they can
            # just be added to the left list to patch it, and stop
            ops['ln'] = self.right[index:]
            break
        elif right_value is missing:
            # the left value is longer, so store the index from which elements in
            # the left list will be deleted to shorten it to the length of the right
            # list, and stop
            ops['ld'] = index
            break
        else:
            # a change in the values at this index in each list, check for nested
            # container objects and add Comparison objects to the list if any are
            # found of the same types
            if isinstance(left_value, dict) and isinstance(right_value, dict):
                further_comparisons.append(
                    DictComparison((*self.path, index), left_value, right_value)
                )
            elif isinstance(left_value, list) and isinstance(right_value, list):
                further_comparisons.append(
                    ListComparison((*self.path, index), left_value, right_value)
                )
            else:
                changes.append((index, right_value))
    if changes:
        ops['lc'] = changes

    return DiffOp(self.path, ops) if ops else None, further_comparisons

diff(base, new)

Finds the differences between the two dicts, yielding DiffOps. Each DiffOp describes specific differences between the base dict and the new dict. By applying them all using the patch function below, the new dict can be recreated from the base dict.

For efficiency, the DiffOps represent all the changes at a container level (e.g. a dict or list) not each specific change to every version at a specific key or index. This saves not only database space, but also allows for a faster patch function as changes can be applied en masse instead of individually.

Parameters:

Name Type Description Default
base dict

the base dict

required
new dict

the new version of the base dict

required

Returns:

Type Description
Iterable[DiffOp]

yields DiffOps (if any changes are found)

Source code in splitgill/diffing.py
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
def diff(base: dict, new: dict) -> Iterable[DiffOp]:
    """
    Finds the differences between the two dicts, yielding DiffOps. Each DiffOp describes
    specific differences between the base dict and the new dict. By applying them all
    using the patch function below, the new dict can be recreated from the base dict.

    For efficiency, the DiffOps represent all the changes at a container level (e.g. a
    dict or list) not each specific change to every version at a specific key or index.
    This saves not only database space, but also allows for a faster patch function as
    changes can be applied en masse instead of individually.

    :param base: the base dict
    :param new: the new version of the base dict
    :return: yields DiffOps (if any changes are found)
    """
    if base == new:
        return

    if not isinstance(base, dict) or not isinstance(new, dict):
        raise DiffingTypeComparisonException('Both base and new must be dicts')

    # todo: we could write a shortcut when one of the dicts is empty

    queue: Deque[Comparison] = deque([DictComparison(tuple(), base, new)])
    while queue:
        comparison: Comparison = queue.popleft()
        diff_op, further_comparisons = comparison.compare()
        if diff_op:
            yield diff_op
        if further_comparisons:
            queue.extend(further_comparisons)

patch(base, ops)

Applies the operations in the ops iterable to the base dict, returning a new dict. If there are no operations to apply, the base dict is returned unchanged.

Note that although the returned dict is new, the nested container values in it may or may not be new and could reference the same exact object as in the passed base dict. A nested container will be copied to avoid modifying the same container referenced in the base dict if there are any modifications made to it directly, or to any nested containers below it at any depth. If the nested container contains no changes to itself or its nested containers, it is not copied and the original reference to it from the base dict is used.

Parameters:

Name Type Description Default
base dict

the starting dict

required
ops Collection[DiffOp]

the DiffOps to apply to the base dict (can be pure tuples, doesn't have to be DiffOp namedtuples)

required

Returns:

Type Description
dict

a new dict with the changes applied

Source code in splitgill/diffing.py
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
def patch(base: dict, ops: Collection[DiffOp]) -> dict:
    """
    Applies the operations in the ops iterable to the base dict, returning a new dict.
    If there are no operations to apply, the base dict is returned unchanged.

    Note that although the returned dict is new, the nested container values in it may
    or may not be new and could reference the same exact object as in the passed base
    dict. A nested container will be copied to avoid modifying the same container
    referenced in the base dict if there are any modifications made to it directly, or
    to any nested containers below it at any depth. If the nested container contains no
    changes to itself or its nested containers, it is not copied and the original
    reference to it from the base dict is used.

    :param base: the starting dict
    :param ops: the DiffOps to apply to the base dict (can be pure tuples, doesn't have
        to be DiffOp namedtuples)
    :return: a new dict with the changes applied
    """
    # nothing to do
    if len(ops) == 0:
        return base

    # create a copy of the base dict so that we don't modify it and can return a new one
    new = base.copy()

    for path, op in ops:
        # loop through the path finding the target of the operations we're going to
        # perform. At every point in the path, replace the container with a copy to
        # ensure we don't modify the container object from the base.
        target = new
        for key_or_index in path:
            target_copy = target[key_or_index].copy()
            target[key_or_index] = target_copy
            target = target_copy

        # dict ops
        if 'dc' in op:
            target.update(op['dc'])
        if 'dn' in op:
            target.update(op['dn'])
        if 'dd' in op:
            for key in op['dd']:
                del target[key]

        # list ops
        if 'lc' in op:
            for index, value in op['lc']:
                target[index] = value
        if 'ln' in op:
            target.extend(op['ln'])
        if 'ld' in op:
            del target[op['ld'] :]

    return new

prepare_data(value)

Prepares the given value for storage in MongoDB. Conversions are completed like so:

    - None values are just returned as is
    - str values have invalid characters removed and are then returned. The
      characters are currently all unicode control characters except

, , and . - int, float, bool, and None values are returned with no changes made - datetime and date values are converted to strings using strftime with the specific formats DATETIME_FORMAT and DATE_FORMAT. - dict values are returned as a new dict instance, with all the keys converted to strings and all the values recursively prepared using this function. - lists, sets, and tuples are converted to lists with each element of the value prepared by this function.

:param value: the value to be stored in MongoDB
:return: None, str, int, float, bool, tuple, or dict depending on the input value
Source code in splitgill/diffing.py
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
def prepare_data(
    value: Any,
) -> Union[str, dict, list, int, float, bool, None]:
    """
    Prepares the given value for storage in MongoDB. Conversions are completed like so:

        - None values are just returned as is
        - str values have invalid characters removed and are then returned. The
          characters are currently all unicode control characters except \n, \r, and \t.
        - int, float, bool, and None values are returned with no changes made
        - datetime and date values are converted to strings using strftime with the
          specific formats DATETIME_FORMAT and DATE_FORMAT.
        - dict values are returned as a new dict instance, with all the keys converted
          to strings and all the values recursively prepared using this function.
        - lists, sets, and tuples are converted to lists with each element of the value
          prepared by this function.

    :param value: the value to be stored in MongoDB
    :return: None, str, int, float, bool, tuple, or dict depending on the input value
    """
    if value is None:
        return None

    if isinstance(value, str):
        # replace any invalid characters in the string with the empty string
        return invalid_value_char_regex.sub('', value)

    if isinstance(value, (int, float, bool)):
        return value

    if isinstance(value, dict):
        return {
            prepare_field_name(key): prepare_data(value) for key, value in value.items()
        }

    if isinstance(value, (list, set, tuple)):
        return list(map(prepare_data, value))

    # check datetime first as datetime is a subclass of date
    if isinstance(value, datetime):
        # stringifying this ensures the tz info is recorded and won't change going
        # in/out mongo
        return value.strftime(DATETIME_FORMAT)

    # now check date as we've covered off datetimes
    if isinstance(value, date):
        # stringify to simplify handling of dates to mirror datetime pattern even though
        # the same timezone issue doesn't exist here
        return value.strftime(DATE_FORMAT)

    # fallback
    return str(value)

prepare_field_name(name)

Cleans up a field name for ingestion into the system. There are a few steps to this:

- convert the name to a str as MongoDB only accepts str keys in objects
- remove any control characters from the str
- replace . with _ as Elasticsearch doesn't like dots in keys
- replace any name starting _ with - as _ is a reserved character in Splitgill
  that we use in special cases

If after cleaning, the field name is an empty string, we return a hyphen.

This function explicitly handles the _id field by just returning it if encountered.

Parameters:

Name Type Description Default
name Any

the field name

required

Returns:

Type Description
str

a clean str field name

Source code in splitgill/diffing.py
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
def prepare_field_name(name: Any) -> str:
    """
    Cleans up a field name for ingestion into the system. There are a few steps to this:

        - convert the name to a str as MongoDB only accepts str keys in objects
        - remove any control characters from the str
        - replace . with _ as Elasticsearch doesn't like dots in keys
        - replace any name starting _ with - as _ is a reserved character in Splitgill
          that we use in special cases

    If after cleaning, the field name is an empty string, we return a hyphen.

    This function explicitly handles the _id field by just returning it if encountered.

    :param name: the field name
    :return: a clean str field name
    """
    clean_name = invalid_key_char_regex.sub('', str(name)).replace('.', '_').strip()
    # if this results in the empty string, replace with a hyphen
    if not clean_name:
        return '-'

    if name == DATA_ID_FIELD:
        return DATA_ID_FIELD

    if clean_name[0] == '_':
        clean_name = f'-{clean_name[1:]}'
    return clean_name