Skip to content

Fields

DataField dataclass

Class representing a field in the original record data structure.

Source code in splitgill/indexing/fields.py
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
@dataclasses.dataclass
class DataField:
    """
    A single field found in the original (unparsed) record data structure.

    Keeps track of how many records contain the field, which data types its values
    have been seen as, and the field's parent and children.
    """

    # the dotted path to the field. Empty path segments may be present, they are
    # used to indicate list elements
    path: str
    # how many records have a field with this path
    count: int = 0
    # for each data type, how many records have this field holding a value of that
    # type
    type_counts: CounterType[DataType] = dataclasses.field(default_factory=Counter)
    # the field this field sits beneath (None means this field is at the root of the
    # record data)
    parent: Optional['DataField'] = None
    # the fields directly beneath this field (only populated when this field appears
    # as a list or a dict)
    children: List['DataField'] = dataclasses.field(default_factory=list)

    def add(self, type_names: str, count: int):
        """
        Record another batch of type count data against this field.

        :param type_names: comma separated names of the types the field was seen as
        :param count: how many records had this combination of types
        """
        self.count += count
        for data_type in map(DataType, type_names.split(',')):
            self.type_counts[data_type] += count

    def is_type(self, *data_types: DataType) -> bool:
        """
        Reports whether this field has been seen as at least one of the given types.

        :param data_types: the data types to look for
        :return: True if any of the given data types has a count above zero, False
            otherwise (including when no data types are given)
        """
        for candidate in data_types:
            if self.type_counts[candidate] > 0:
                return True
        return False

    @property
    def has_children(self) -> bool:
        return bool(self.children)

    @property
    def name(self) -> str:
        # the name is whatever follows the last dot in the path
        return self.path.rpartition('.')[2]

    @property
    def is_none(self) -> bool:
        return self.count_none > 0

    @property
    def count_none(self) -> int:
        return self.type_counts[DataType.NONE]

    @property
    def is_str(self) -> bool:
        return self.count_str > 0

    @property
    def count_str(self) -> int:
        return self.type_counts[DataType.STR]

    @property
    def is_int(self) -> bool:
        return self.count_int > 0

    @property
    def count_int(self) -> int:
        return self.type_counts[DataType.INT]

    @property
    def is_float(self) -> bool:
        return self.count_float > 0

    @property
    def count_float(self) -> int:
        return self.type_counts[DataType.FLOAT]

    @property
    def is_bool(self) -> bool:
        return self.count_bool > 0

    @property
    def count_bool(self) -> int:
        return self.type_counts[DataType.BOOL]

    @property
    def is_list(self) -> bool:
        return self.count_list > 0

    @property
    def count_list(self) -> int:
        return self.type_counts[DataType.LIST]

    @property
    def is_dict(self) -> bool:
        return self.count_dict > 0

    @property
    def count_dict(self) -> int:
        return self.type_counts[DataType.DICT]

    @property
    def is_basic(self) -> bool:
        # basic means seen as any of the non-container types
        return (
            self.is_bool or self.is_int or self.is_float or self.is_str or self.is_none
        )

    @property
    def is_container(self) -> bool:
        return self.is_list or self.is_dict

    @property
    def is_root_field(self) -> bool:
        return self.parent is None

    @property
    def parsed_path(self) -> str:
        """
        Returns the equivalent parsed path for this data field, which is this field's
        path with the empty segments dropped.

        :return: a str path
        """
        return '.'.join(segment for segment in self.path.split('.') if segment)

    @property
    def is_list_element(self) -> bool:
        # list elements are represented by an empty final path segment
        return not self.name

parsed_path property

Returns the equivalent parsed path for this data field.

Returns:

| Type | Description |
| --- | --- |
| `str` | a str path |

add(type_names, count)

Add the given type count data to this field.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `type_names` | `str` | the types this field is seen as a string of their names separated by commas. | required |
| `count` | `int` | the number of records with this combination of types | required |

Source code in splitgill/indexing/fields.py
164
165
166
167
168
169
170
171
172
173
174
def add(self, type_names: str, count: int):
    """
    Record another batch of type count data against this field.

    :param type_names: comma separated names of the types the field was seen as
    :param count: how many records had this combination of types
    """
    self.count += count
    for data_type in map(DataType, type_names.split(',')):
        self.type_counts[data_type] += count

is_type(*data_types)

Checks if this field is an instance of one of the given data types.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `data_types` | `DataType` | the data types to be checked | `()` |

Returns:

| Type | Description |
| --- | --- |
| `bool` | True if the field is an instance of one of the given data types, False if not |

Source code in splitgill/indexing/fields.py
176
177
178
179
180
181
182
183
184
def is_type(self, *data_types: DataType) -> bool:
    """
    Reports whether this field has been seen as at least one of the given types.

    :param data_types: the data types to look for
    :return: True if any of the given data types has a count above zero, False
        otherwise (including when no data types are given)
    """
    for candidate in data_types:
        if self.type_counts[candidate] > 0:
            return True
    return False

DataType

Bases: LowercaseStrEnum

Enum representing the types of data Splitgill indexes as user data.

The types represented here should match the output of diffing.prepare_data.

Source code in splitgill/indexing/fields.py
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
class DataType(LowercaseStrEnum):
    """
    Enum representing the types of data Splitgill indexes as user data.

    The types represented here should match the output of diffing.prepare_data.
    """

    NONE = '#n'
    STR = '#s'
    INT = '#i'
    FLOAT = '#f'
    BOOL = '#b'
    LIST = '#l'
    DICT = '#d'

    @classmethod
    def type_for(
        cls, value: Union[str, int, float, bool, dict, list, None]
    ) -> 'DataType':
        """
        Given a value, return the DataType enum for it. If the value's type isn't one we
        support, a TypeError is thrown.

        The type is resolved with isinstance checks rather than from the name of the
        value's class, so instances of subclasses of the supported types (for example
        an OrderedDict) resolve to the DataType of the supported type they extend.

        :param value: value to get the type for
        :return: a DataType
        :raises TypeError: if the value's type is not a supported type
        """
        if value is None:
            return cls.NONE
        if isinstance(value, valid_data_types):
            # bool has to be checked before int because bool is a subclass of int
            candidates = (
                (bool, cls.BOOL),
                (int, cls.INT),
                (float, cls.FLOAT),
                (str, cls.STR),
                (list, cls.LIST),
                (dict, cls.DICT),
            )
            for python_type, data_type in candidates:
                if isinstance(value, python_type):
                    return data_type
        raise TypeError(
            f'Type ({type(value)}) of value ({value}) not valid DataType'
        )

type_for(value) classmethod

Given a value, return the DataType enum for it. If the value's type isn't one we support, a TypeError is thrown.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `value` | `Union[str, int, float, bool, dict, list, None]` | value to get the type for | required |

Returns:

| Type | Description |
| --- | --- |
| `DataType` | a DataType |

Source code in splitgill/indexing/fields.py
129
130
131
132
133
134
135
136
137
138
139
140
141
142
@classmethod
def type_for(
    cls, value: Union[str, int, float, bool, dict, list, None]
) -> 'DataType':
    """
    Given a value, return the DataType enum for it. If the value's type isn't one we
    support, a TypeError is thrown.

    The type is resolved with isinstance checks rather than from the name of the
    value's class, so instances of subclasses of the supported types (for example an
    OrderedDict) resolve to the DataType of the supported type they extend.

    :param value: value to get the type for
    :return: a DataType
    :raises TypeError: if the value's type is not a supported type
    """
    if value is None:
        return cls.NONE
    if isinstance(value, valid_data_types):
        # bool has to be checked before int because bool is a subclass of int
        candidates = (
            (bool, cls.BOOL),
            (int, cls.INT),
            (float, cls.FLOAT),
            (str, cls.STR),
            (list, cls.LIST),
            (dict, cls.DICT),
        )
        for python_type, data_type in candidates:
            if isinstance(value, python_type):
                return data_type
    raise TypeError(
        f'Type ({type(value)}) of value ({value}) not valid DataType'
    )

DocumentField

Bases: LowercaseStrEnum

Enum representing the fields used in the indexed documents.

Source code in splitgill/indexing/fields.py
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
class DocumentField(LowercaseStrEnum):
    """
    Enum representing the fields used in the indexed documents.

    Every member's value is generated with auto().
    """

    # the record ID
    ID = auto()
    # the version of this record
    VERSION = auto()
    # the next version of this record (if available)
    NEXT = auto()
    # the range of versions this record is valid for. The lower bound is the same value
    # as the version field and the upper bound is the same value as the next field
    VERSIONS = auto()
    # the record's data parsed for indexing
    DATA = auto()
    # type information about the fields found in the data
    DATA_TYPES = auto()
    # type information about the fields found in the parsed data
    PARSED_TYPES = auto()
    # a text field into which all data is added to support "search everything" searches
    ALL_TEXT = auto()
    # ALL_POINTS and ALL_SHAPES are geo fields into which all geo data is added to
    # support "search everything" geo searches. ALL_SHAPES gets all data from GEO_SHAPE
    # parsed fields and ALL_POINTS gets all data from GEO_POINT parsed fields. If you're
    # doing a search, you probably want to use the ALL_SHAPES field, but if you're
    # mapping the results you'll need to aggregate on the ALL_POINTS value, as geo
    # aggregations aren't allowed on the geo shape data type unless you use a paid
    # Elasticsearch version (yes this is annoying).
    ALL_POINTS = auto()
    ALL_SHAPES = auto()

ParsedField dataclass

Class representing a field in the parsed record data structure.

Source code in splitgill/indexing/fields.py
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
@dataclasses.dataclass
class ParsedField:
    """
    A single field found in the parsed record data structure.

    Keeps track of how many records contain the field and which parsed types its
    values have been seen as.
    """

    # the path to the field
    path: str
    # running total of the counts passed to add
    count: int = 0
    # for each parsed type, the total count added for that type
    type_counts: CounterType[ParsedType] = dataclasses.field(default_factory=Counter)

    def add(self, type_names: str, count: int):
        """
        Record another batch of type count data against this field.

        :param type_names: comma separated names of the types the field was seen as
        :param count: how many records had this combination of types
        """
        self.count += count
        for parsed_type in map(ParsedType, type_names.split(',')):
            self.type_counts[parsed_type] += count

    def is_type(self, *parsed_types: ParsedType) -> bool:
        """
        Reports whether this field has been seen as at least one of the given parsed
        types.

        :param parsed_types: the parsed types to look for
        :return: True if any of the given parsed types has a count above zero, False
            otherwise (including when no parsed types are given)
        """
        for candidate in parsed_types:
            if self.type_counts[candidate] > 0:
                return True
        return False

    @property
    def name(self) -> str:
        # the name is whatever follows the last dot in the path
        return self.path.rpartition('.')[2]

    @property
    def is_text(self) -> bool:
        return self.count_text > 0

    @property
    def count_text(self) -> int:
        return self.type_counts[ParsedType.TEXT]

    @property
    def is_keyword(self) -> bool:
        return self.count_keyword > 0

    @property
    def count_keyword(self) -> int:
        return self.type_counts[ParsedType.KEYWORD]

    @property
    def is_number(self) -> bool:
        return self.count_number > 0

    @property
    def count_number(self) -> int:
        return self.type_counts[ParsedType.NUMBER]

    @property
    def is_date(self) -> bool:
        return self.count_date > 0

    @property
    def count_date(self) -> int:
        return self.type_counts[ParsedType.DATE]

    @property
    def is_boolean(self) -> bool:
        return self.count_boolean > 0

    @property
    def count_boolean(self) -> int:
        return self.type_counts[ParsedType.BOOLEAN]

    @property
    def is_geo(self) -> bool:
        # geo point alone is enough here, see count_geo
        return self.count_geo > 0

    @property
    def count_geo(self) -> int:
        # records are either parsed without any geo data or with both a geo point and
        # a geo shape, so counting the geo point on its own is sufficient
        return self.type_counts[ParsedType.GEO_POINT]

add(type_names, count)

Add the given type count data to this field.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `type_names` | `str` | the types this field is seen as a string of their names separated by commas. | required |
| `count` | `int` | the number of records with this combination of types | required |

Source code in splitgill/indexing/fields.py
288
289
290
291
292
293
294
295
296
297
298
def add(self, type_names: str, count: int):
    """
    Record another batch of type count data against this field.

    :param type_names: comma separated names of the types the field was seen as
    :param count: how many records had this combination of types
    """
    self.count += count
    for parsed_type in map(ParsedType, type_names.split(',')):
        self.type_counts[parsed_type] += count

is_type(*parsed_types)

Checks if this field is an instance of one of the given parsed types.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `parsed_types` | `ParsedType` | the parsed types to be checked | `()` |

Returns:

| Type | Description |
| --- | --- |
| `bool` | True if the field is an instance of one of the given parsed types, False if not |

Source code in splitgill/indexing/fields.py
300
301
302
303
304
305
306
307
308
def is_type(self, *parsed_types: ParsedType) -> bool:
    """
    Reports whether this field has been seen as at least one of the given parsed
    types.

    :param parsed_types: the parsed types to look for
    :return: True if any of the given parsed types has a count above zero, False
        otherwise (including when no parsed types are given)
    """
    for candidate in parsed_types:
        if self.type_counts[candidate] > 0:
            return True
    return False

ParsedType

Bases: StrEnum

Enum representing the possible parsed data types a value can be indexed as.

It's generally recommended to not use these directly, but to use the convenience functions defined later in this module or in the search module.

Source code in splitgill/indexing/fields.py
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
class ParsedType(StrEnum):
    """
    Enum representing the possible parsed data types a value can be indexed as.

    It's generally recommended to not use these directly, but to use the convenience
    functions defined later in this module or in the search module.
    """

    # the unparsed raw field value
    UNPARSED = '_u'
    # the number field
    NUMBER = '_n'
    # the date field
    DATE = '_d'
    # the boolean field
    BOOLEAN = '_b'
    # the text field
    TEXT = '_t'
    # the keyword case-insensitive field
    KEYWORD = '_k'
    # the geo point field (shape centroid, will always be a point)
    GEO_POINT = '_gp'
    # the geo shape field (full shape, could be point, linestring, or polygon)
    GEO_SHAPE = '_gs'

    def path_to(self, field: str, full: bool = True) -> str:
        """
        Creates and returns the parsed path to the field indexed with this type.

        :param field: the name (including dots if needed) of the field
        :param full: whether to prepend the parsed field name to the path or not
            (default: True)
        :return: the path
        """
        return parsed_path(field, self, full)

path_to(field, full=True)

Creates and returns the parsed path to the field indexed with this type.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `field` | `str` | the name (including dots if needed) of the field | required |
| `full` | `bool` | whether to prepend the parsed field name to the path or not (default: True) | `True` |

Returns:

| Type | Description |
| --- | --- |
| `str` | the path |

Source code in splitgill/indexing/fields.py
73
74
75
76
77
78
79
80
81
82
def path_to(self, field: str, full: bool = True) -> str:
    """
    Builds the parsed path to the given field as indexed with this type, by
    delegating to parsed_path.

    :param field: the name (including dots if needed) of the field
    :param full: whether to prepend the parsed field name to the path or not
        (default: True)
    :return: the path
    """
    return parsed_path(field, parsed_type=self, full=full)

parsed_path(field, parsed_type=None, full=True)

Creates and returns the parsed path to the field indexed with the given parsed type. Optionally, the full path is created, in which case the result is prefixed with the name of the `DocumentField.DATA` field. If no parsed_type is provided (i.e. parsed_type=None, the default), then the root path to the field in the parsed object is returned.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `field` | `str` | the name (including dots if needed) of the field | required |
| `parsed_type` | `Optional[ParsedType]` | the parsed type (default: None) | `None` |
| `full` | `bool` | whether to prepend the parsed field name to the path or not (default: True) | `True` |

Returns:

| Type | Description |
| --- | --- |
| `str` | the path |

Source code in splitgill/indexing/fields.py
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
def parsed_path(
    field: str, parsed_type: Optional[ParsedType] = None, full: bool = True
) -> str:
    """
    Builds the path to the given field within the parsed data.

    When a parsed type is given it is appended to the field as a final path segment,
    otherwise (parsed_type=None, the default) the path stops at the field itself.
    When full is True the result is additionally prefixed with the DocumentField.DATA
    field.

    :param field: the name (including dots if needed) of the field
    :param parsed_type: the parsed type (default: None)
    :param full: whether to prepend the DocumentField.DATA field to the path or not
        (default: True)
    :return: the path
    """
    if parsed_type is None:
        # no type requested, so the path is just the field
        path = field
    else:
        path = '.'.join((f'{field}', f'{parsed_type}'))

    return f'{DocumentField.DATA}.{path}' if full else path