Skip to content

Model

GeoFieldHint dataclass

Class holding the names of the fields in a record which describe its latitude/longitude location and an optional uncertainty radius.

Source code in splitgill/model.py
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
@dataclass(frozen=True)
class GeoFieldHint:
    """
    Class holding the names of the fields in a record which describe its
    latitude/longitude location and an optional uncertainty radius.

    Equality and hashing are based solely on lat_field, so two hints which share a
    latitude field are treated as the same hint regardless of their other attributes.
    """

    lat_field: str
    lon_field: str
    radius_field: Optional[str] = None
    # the number of segments to use to create a circle around a point when a radius is
    # provided in the geo hint. Circles can't be directly represented in WKT nor
    # GeoJSON, so we have to build a polygon instead that looks like a circle using
    # triangles. This setting configures the number of segments to use to make the
    # circle, the higher this number the more accurate the polygon's representation of
    # the circle, but the more complex the shape. Defaults to 16 which produces 64 (+1
    # for the repeat start/end) coordinates in the resulting polygon. This should be
    # enough for the majority of uses.
    segments: int = 16

    def __eq__(self, other: Any) -> bool:
        """
        Compare this hint to another object. Only the lat_field is compared.

        :param other: the object to compare against
        :return: True if other is a GeoFieldHint with the same lat_field, False if it
            is a GeoFieldHint with a different lat_field, and NotImplemented for any
            other type so that Python can fall back to its default comparison
        """
        if isinstance(other, GeoFieldHint):
            return self.lat_field == other.lat_field
        # NotImplemented is a sentinel value which must be returned, it is not an
        # exception class and raising it results in a TypeError
        return NotImplemented

    def __hash__(self) -> int:
        """
        Hash this hint using only the lat_field, matching the __eq__ implementation.

        :return: the hash of the lat_field
        """
        return hash(self.lat_field)

IngestResult dataclass

A dataclass containing information about the new data ingested into MongoDB.

Source code in splitgill/model.py
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
@dataclass
class IngestResult:
    """
    Summary of the outcome of ingesting new data into MongoDB.
    """

    # version the new data was added at. This is None if the data was not committed
    # or if no new data was added
    version: Optional[int] = None
    # running total of insert operations performed
    inserted: int = 0
    # running total of update operations performed
    updated: int = 0
    # running total of delete operations performed
    deleted: int = 0

    @property
    def was_committed(self) -> bool:
        """
        Whether the data was committed, which is the case when a version has been set.

        :return: True if the data was committed, False if not
        """
        return not (self.version is None)

    def update(self, bulk_result: BulkWriteResult):
        """
        Add the counts from the given bulk result object to this object's totals.

        :param bulk_result: a BulkWriteResult object
        """
        self.inserted = self.inserted + bulk_result.inserted_count
        self.updated = self.updated + bulk_result.modified_count
        self.deleted = self.deleted + bulk_result.deleted_count

was_committed property

Returns True if the data was committed, False if not. This is determined by whether a version is available.

Returns:

Type Description
bool

True if the data was committed, False if not

update(bulk_result)

Update the counts with the counts in the bulk result object.

Parameters:

Name Type Description Default
bulk_result BulkWriteResult

a BulkWriteResult object

required
Source code in splitgill/model.py
220
221
222
223
224
225
226
227
228
def update(self, bulk_result: BulkWriteResult):
    """
    Add the counts from the given bulk result object to this object's totals.

    :param bulk_result: a BulkWriteResult object
    """
    self.inserted = self.inserted + bulk_result.inserted_count
    self.updated = self.updated + bulk_result.modified_count
    self.deleted = self.deleted + bulk_result.deleted_count

MongoRecord dataclass

A record retrieved from MongoDB.

Source code in splitgill/model.py
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
@dataclass
class MongoRecord:
    """
    A record as retrieved from MongoDB.
    """

    _id: ObjectId
    id: str
    version: Optional[int]
    data: dict
    # the keys are versions, stored as strings rather than ints because MongoDB
    # doesn't allow non-string keys
    diffs: Dict[str, List[DiffOp]] = field(default_factory=dict)

    @property
    def is_deleted(self) -> bool:
        """
        Whether the record is deleted, i.e. its current data is an empty dict.

        :return: True if this record has been deleted, False if not
        """
        return False if self.data else True

    @property
    def is_uncommitted(self) -> bool:
        """
        Whether the record is uncommitted, i.e. its current version is None.

        :return: True if this record is uncommitted, False if not
        """
        return not (self.version is not None)

    @property
    def has_history(self) -> bool:
        """
        Whether the record has any history, i.e. it has at least one diff.

        :return: True if this record has previous versions, False if not
        """
        return True if self.diffs else False

    def get_versions(self, desc=False) -> List[int]:
        """
        Lists the versions of this record, ascending by default. The current version
        is only included if it is not None.

        :param desc: if True, the versions are returned in descending order instead
        :return: the record's versions
        """
        # the diff keys are string versions, so convert them to ints
        numbers = [int(key) for key in self.diffs]
        if self.version is not None:
            numbers.append(self.version)
        numbers.sort(reverse=desc)
        return numbers

    def iter(self) -> Iterable[VersionedData]:
        """
        Yields each version of this record along with the data at that version, as
        (int, dict) VersionedData named tuples. The latest data is yielded first and
        each older version is then produced by patching the data yielded before it
        with that version's diff.

        :return: VersionedData (version: int, data: dict) named tuples in descending
            version order
        """
        yield VersionedData(self.version, self.data)
        previous = self.data
        # the diffs are keyed by string versions, order them numerically, newest first
        descending = sorted((int(key) for key in self.diffs), reverse=True)
        for number in descending:
            previous = patch(previous, self.diffs[str(number)])
            # the version is yielded as an int, not the string used as the diff key
            yield VersionedData(number, previous)

has_history property

A record has history if it has any diffs.

Returns:

Type Description
bool

True if this record has previous versions, False if not

is_deleted property

A record is deleted if its current data is an empty dict.

Returns:

Type Description
bool

True if this record has been deleted, False if not

is_uncommitted property

A record is uncommitted if its current version is None.

Returns:

Type Description
bool

True if this record is uncommitted, False if not

get_versions(desc=False)

Returns a list of the record's versions in ascending order. If desc is True, the versions are returned in descending order. If the current version is None, it is not included.

Returns:

Type Description
List[int]

the record's versions

Source code in splitgill/model.py
83
84
85
86
87
88
89
90
91
92
93
94
def get_versions(self, desc=False) -> List[int]:
    """
    Lists the versions of this record, ascending by default. The current version is
    only included if it is not None.

    :param desc: if True, the versions are returned in descending order instead
    :return: the record's versions
    """
    # the diff keys are string versions, so convert them to ints
    numbers = [int(key) for key in self.diffs]
    if self.version is not None:
        numbers.append(self.version)
    numbers.sort(reverse=desc)
    return numbers

iter()

Yields the versions and data of this record. These are yielded as (int, dict) VersionedData named tuples. The tuples are yielded in reverse order, starting with the latest data and working back to the first version.

Returns:

Type Description
Iterable[VersionedData]

VersionedData (version: int, data: dict) named tuples in descending version order

Source code in splitgill/model.py
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
def iter(self) -> Iterable[VersionedData]:
    """
    Yields each version of this record along with the data at that version, as
    (int, dict) VersionedData named tuples. The latest data is yielded first and each
    older version is then produced by patching the data yielded before it with that
    version's diff.

    :return: VersionedData (version: int, data: dict) named tuples in descending
        version order
    """
    yield VersionedData(self.version, self.data)
    previous = self.data
    # the diffs are keyed by string versions, order them numerically, newest first
    descending = sorted((int(key) for key in self.diffs), reverse=True)
    for number in descending:
        previous = patch(previous, self.diffs[str(number)])
        # the version is yielded as an int, not the string used as the diff key
        yield VersionedData(number, previous)

ParsingOptions dataclass

Holds options for parsing.

The objects created using this class are immutable. You can instantiate them directly, but it's better to use the ParsingOptionBuilder defined below.

Source code in splitgill/model.py
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
@dataclass(frozen=True)
class ParsingOptions:
    """
    Holds options for parsing.

    The objects created using this class are immutable. You can instantiate them
    directly, but it's better to use the ParsingOptionBuilder defined below.
    """

    # lowercase string values which should be parsed as True
    true_values: FrozenSet[str]
    # lowercase string values which should be parsed as False
    false_values: FrozenSet[str]
    # date format strings to test candidates against using datetime.strptime
    date_formats: FrozenSet[str]
    # GeoFieldHint objects which can be used to test if a record contains any geographic
    # coordinate data
    geo_hints: FrozenSet[GeoFieldHint]
    # the maximum length of keyword strings. Strings will be truncated to this length
    # before indexing
    keyword_length: int
    # the format to use to convert a float to a string for indexing. The string will
    # have format() called on it with the float value passed as the only parameter,
    # therefore the format string should use 0 to reference it
    float_format: str

    def to_doc(self) -> dict:
        """
        Converts these options into a dict. The frozensets become lists and each geo
        hint becomes a tuple of its field values (via dataclasses.astuple). As the
        sources are sets, the order of the items in each list is not defined.

        :return: a dict representation of these options
        """
        return {
            'true_values': list(self.true_values),
            'false_values': list(self.false_values),
            'date_formats': list(self.date_formats),
            'geo_hints': [astuple(hint) for hint in self.geo_hints],
            'keyword_length': self.keyword_length,
            'float_format': self.float_format,
        }

    @classmethod
    def from_doc(cls, doc: dict) -> 'ParsingOptions':
        """
        Creates an options object from a dict in the format produced by to_doc.

        :param doc: a dict containing the true_values, false_values, date_formats,
            geo_hints, keyword_length, and float_format keys
        :return: a new instance of the class this method was called on
        :raises KeyError: if any of the expected keys are missing from the doc
        """
        # instantiate via cls rather than naming ParsingOptions directly so that
        # calling this on a subclass produces an instance of that subclass
        return cls(
            frozenset(doc['true_values']),
            frozenset(doc['false_values']),
            frozenset(doc['date_formats']),
            # each hint is stored as a sequence of GeoFieldHint's positional arguments
            frozenset(GeoFieldHint(*params) for params in doc['geo_hints']),
            doc['keyword_length'],
            doc['float_format'],
        )

Record dataclass

A record before it becomes managed by Splitgill.

Source code in splitgill/model.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
@dataclass
class Record:
    """
    A record which is not yet managed by Splitgill.
    """

    id: str
    data: dict

    @property
    def is_delete(self) -> bool:
        """
        Whether this record is a delete request, which is a record with empty data
        ({}).

        :return: True if this is a delete, False if not
        """
        return False if self.data else True

    @staticmethod
    def new(data: dict) -> 'Record':
        """
        Creates a record holding the given data with a newly generated UUID4 string as
        its ID.

        :param data: the record's data
        :return: a new Record object
        """
        generated_id = str(uuid4())
        return Record(id=generated_id, data=data)

    @staticmethod
    def delete(record_id: str) -> 'Record':
        """
        Creates a delete request for the given record ID, i.e. a record with that ID
        and empty data.

        :param record_id: the ID of the record to delete
        :return: a new Record object with empty data
        """
        return Record(id=record_id, data={})

is_delete property

Returns True if this record is a delete request, otherwise False. A delete request is a record with empty data ({}).

Returns:

Type Description
bool

True if this is a delete, False if not