Skip to content

Parser

ParsedData

Bases: NamedTuple

A named tuple containing the parse result.

Source code in splitgill/indexing/parser.py
13
14
15
16
17
18
19
20
class ParsedData(NamedTuple):
    """
    The result of a parse, bundled together as a named tuple.
    """

    # the parsed representation of the input data
    parsed: dict
    # the data type entries collected while parsing
    data_types: list
    # the parsed type entries collected while parsing
    parsed_types: list

parse(data, options)

Parse the given dict and return a ParsedData named tuple. This is the main entry point for parsing data for indexing.

Parameters:

Name Type Description Default
data dict

the dict to parse

required
options ParsingOptions

the parsing options

required

Returns:

Type Description
ParsedData

a ParsedData named tuple

Source code in splitgill/indexing/parser.py
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
def parse(data: dict, options: ParsingOptions) -> ParsedData:
    """
    Main entry point for parsing data for indexing: parses the given dict and returns
    the result as a ParsedData named tuple.

    :param data: the dict to parse
    :param options: the parsing options
    :return: a ParsedData named tuple
    """

    def path_of(entry: str) -> str:
        # everything before the final dot is the field path, the rest is the type
        return entry.rsplit('.', 1)[0]

    def compress(entries: list) -> list:
        # parse_dict gives us one "path.type" element per type seen at a path, here we
        # collapse those into one "path.type1,type2,..." element per path. This is
        # necessary because it ensures we can get an accurate count of how many records
        # had each field in them, and it's also more efficient for Elasticsearch to
        # handle (smaller doc to index which uses less space, and makes aggregations
        # faster as there are fewer unique values).
        # groupby only merges adjacent elements so the entries must be sorted first
        entries.sort()
        compressed = []
        for path, members in groupby(entries, path_of):
            type_names = ','.join(member.rsplit('.', 1)[1] for member in members)
            compressed.append(f'{path}.{type_names}')
        return compressed

    parsed, data_types, parsed_types = parse_dict(data, options, False)
    parsed_types = compress(parsed_types)
    data_types = compress(data_types)
    return ParsedData(parsed, data_types, parsed_types)

parse_dict(data, options, check_geojson)

Parse the dict and return a ParsedData named tuple.

Parameters:

Name Type Description Default
data dict

dict to parse

required
options ParsingOptions

the parsing options

required
check_geojson bool

whether to check if the root dict is GeoJSON or not

required

Returns:

Type Description
ParsedData

a ParsedData named tuple

Source code in splitgill/indexing/parser.py
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
def parse_dict(data: dict, options: ParsingOptions, check_geojson: bool) -> ParsedData:
    """
    Parse the given dict, recursing into any nested dicts and lists, and return the
    result as a ParsedData named tuple.

    :param data: dict to parse
    :param options: the parsing options
    :param check_geojson: whether to check if the root dict is GeoJSON or not
    :return: a ParsedData named tuple
    """
    parsed = {}
    parsed_types = []
    # every key gets a data type entry, even if its value ends up skipped below
    data_types = [f'{key}.{DataType.type_for(value)}' for key, value in data.items()]

    geojson = match_geojson(data) if check_geojson else None
    if geojson:
        parsed.update(geojson)
        parsed_types.extend(geojson.keys())

    for field, item in data.items():
        if not isinstance(item, (dict, list)):
            # None and values with an empty string form are stored unparsed and don't
            # contribute any parsed types
            if item is None:
                parsed[field] = {ParsedType.UNPARSED: None}
            elif not str(item):
                parsed[field] = {ParsedType.UNPARSED: ''}
            else:
                representations = parse_value(item, options)
                parsed[field] = representations
                parsed_types.extend(
                    f'{field}.{name}' for name in representations.keys()
                )
            continue

        # empty dicts and lists get no entry in the parsed dict at all
        if not item:
            continue

        if isinstance(item, dict):
            nested = parse_dict(item, options, True)
        else:
            nested = parse_list(item, options)
        parsed[field], nested_data_types, nested_parsed_types = nested
        # prefix the nested entries with this field's name to build the full path
        data_types.extend(f'{field}.{entry}' for entry in nested_data_types)
        parsed_types.extend(f'{field}.{entry}' for entry in nested_parsed_types)

    for field, geo in match_hints(data, options.geo_hints).items():
        # the dict currently stored for this field is a cached response from
        # parse_value, so build a new dict instead of adding the geo data in place
        combined = dict(parsed[field])
        combined.update(geo)
        parsed[field] = combined
        parsed_types.extend(f'{field}.{name}' for name in geo.keys())

    return ParsedData(parsed, data_types, parsed_types)

parse_list(data, options)

Parse the given list and return a tuple similar to the ParsedData named tuple in form and identical in function.

Parameters:

Name Type Description Default
data list

the list to parse

required
options ParsingOptions

the parsing options

required

Returns:

Type Description
Tuple[list, set, set]

a list of parsed values, a set of data types, and a set of parsed types

Source code in splitgill/indexing/parser.py
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
def parse_list(data: list, options: ParsingOptions) -> Tuple[list, set, set]:
    """
    Parse the given list and return a plain tuple which mirrors the ParsedData named
    tuple: the parsed values first, then the data types, then the parsed types.

    :param data: the list to parse
    :param options: the parsing options
    :return: a list of parsed values, a set of data types, and a set of parsed types
    """
    data_types = {f'.{DataType.type_for(element)}' for element in data}
    parsed_types = set()
    # elements that are skipped below keep their None placeholder, so the parsed list
    # is always the same length as the input list
    parsed: list = [None] * len(data)

    for position, element in enumerate(data):
        if not isinstance(element, (dict, list)):
            # None and values with an empty string form are skipped
            if element is not None and str(element):
                representations = parse_value(element, options)
                parsed[position] = representations
                parsed_types.update(representations.keys())
            continue

        # empty dicts and lists are skipped too
        if not element:
            continue

        if isinstance(element, dict):
            nested = parse_dict(element, options, True)
        else:
            nested = parse_list(element, options)
        parsed[position], nested_data_types, nested_parsed_types = nested
        data_types.update(f'.{entry}' for entry in nested_data_types)
        # elasticsearch completely flattens lists so when adding the parsed types we
        # just ignore the hierarchy and store the types directly in our set
        parsed_types.update(nested_parsed_types)

    return parsed, data_types, parsed_types

parse_value(value, options) cached

Parse a single value into a dict of typed values. As the typing suggests, this function only deals with ints, floats, strs, and bools. Don't pass it None, lists, or dicts!

The result from this function is cached for performance reasons.

Parameters:

Name Type Description Default
value Union[int, str, bool, float]

a value

required
options ParsingOptions

the parsing options

required

Returns:

Type Description
dict

a dict containing different parsed representations of the value

Source code in splitgill/indexing/parser.py
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
@lru_cache(maxsize=1_000_000, typed=True)
def parse_value(value: Union[int, str, bool, float], options: ParsingOptions) -> dict:
    """
    Build a dict of typed representations for a single value. Only ints, floats, strs,
    and bools are handled here, so never call this with None, a list, or a dict.

    Results are cached for performance reasons, which means the returned dict is shared
    between callers and should be copied rather than modified.

    :param value: a value
    :param options: the parsing options
    :return: a dict containing different parsed representations of the value
    """
    is_bool = isinstance(value, bool)

    # floats are the only type needing special treatment to get a string version,
    # str(value) is sensible for int, bool, and obviously str
    str_value = (
        options.float_format.format(value) if isinstance(value, float) else str(value)
    )

    # these representations are always present
    parsed = {
        ParsedType.UNPARSED: value,
        ParsedType.TEXT: str_value,
        ParsedType.KEYWORD: str_value[: options.keyword_length],
    }

    # WKT geo data
    wkt = match_wkt(str_value)
    if wkt:
        parsed.update(wkt)

    # booleans: real bools are used directly, anything else is matched
    # case-insensitively against the configured true/false values
    if is_bool:
        parsed[ParsedType.BOOLEAN] = value
    else:
        lowered = str_value.lower()
        if lowered in options.true_values:
            parsed[ParsedType.BOOLEAN] = True
        elif lowered in options.false_values:
            parsed[ParsedType.BOOLEAN] = False

    # numbers: real ints and floats are used directly (bool is a subclass of int so it
    # has to be excluded explicitly), anything else gets a parse attempt
    # NOTE(review): a non-finite float passed in directly is stored as-is here, while
    # the string path below asks try_float for None on inf/nan - confirm this is
    # intended
    if not is_bool and isinstance(value, (int, float)):
        parsed[ParsedType.NUMBER] = value
    else:
        number = try_float(str_value, inf=None, nan=None, on_fail=None)
        if number is not None:
            parsed[ParsedType.NUMBER] = number

    # dates: try each format from the options in turn, the first one that works wins
    for date_format in options.date_formats:
        try:
            timestamp = parse_to_timestamp(str_value, date_format)
        except ValueError:
            continue
        parsed[ParsedType.DATE] = timestamp
        break

    return parsed