WSL/SLF GitLab Repository

nead_import.py 11.5 KB
Newer Older
1
#
2
# TODO test running script with multiple stations and from a batch file
3
# TODO FUTURE DEVELOPMENT refactor 'tempdir' argument and storing data downloaded from web rather than temporary file
4
#
5
# Summary: Command used to import data in NEAD format into database.
6
#
7
8
9
# WARNING: Existing records (selected by field 'timestamp') WILL BE UPDATED with the data in the input file,
#          records that do not exist will be created!!!!
#
10
11
12
13
14
# WARNING: To ensure that 'timestamp_iso' values will be aware datetime objects with time zone 'UTC',
#          check that project/settings.py has the following settings:
#               USE_TZ = True
#               TIME_ZONE = 'UTC'
#
15
16
17
18
19
20
21
22
23
24
25
26
# Usage: To see information about command arguments see method add_arguments() or
#        open terminal (with virtual environment activated) and run:
#               python manage.py nead_import --help
#
# Author: Rebecca Kurup Buchholz, WSL
# Date last modified: June 10, 2022
#
# Example command:
#       python manage.py nead_import -s local -i gcnet/data/01-SwissCamp.csv -n -999 -a gcnet -m test


import importlib
27
import os
28

29
30
from pathlib import Path
import requests
Rebecca Kurup Buchholz's avatar
Rebecca Kurup Buchholz committed
31

32
33
34
from django.core.management.base import BaseCommand
from django.utils.timezone import make_aware
from django.apps import apps
35
36
37
from django.forms import model_to_dict

from deepdiff import DeepDiff
38

39
from gcnet.management.commands.importers.helpers.cleaners import get_gcnet_record_clean
40
41
42

# Setup logging
import logging
Rebecca Kurup Buchholz's avatar
Rebecca Kurup Buchholz committed
43
44
45


def setup_logger(logger_name, log_file, level=logging.DEBUG):
    """Attach a file handler to the named logger and return that logger.

    The logger writes to ``log_file`` in append mode, does not propagate to
    its ancestors and is set to ``level``.

    Calling this function again with the same name and file does not add a
    second handler, so messages are never written twice.

    :param logger_name: name passed to ``logging.getLogger``
    :param log_file: path of the log file; missing parent directories are created
    :param level: logging level assigned to the logger (default ``logging.DEBUG``)
    :return: the configured ``logging.Logger``
    """
    log = logging.getLogger(logger_name)

    # FileHandler stores its target as an absolute path in 'baseFilename',
    # so normalise here to be able to compare against existing handlers
    log_path = os.path.abspath(log_file)

    # FileHandler opens the file immediately and raises if the directory is missing
    os.makedirs(os.path.dirname(log_path), exist_ok=True)

    already_attached = any(
        isinstance(handler, logging.FileHandler) and handler.baseFilename == log_path
        for handler in log.handlers
    )

    if not already_attached:
        file_handler = logging.FileHandler(log_path, mode='a')
        # An explicit datefmt is required: the default 'asctime' format already
        # ends with ',mmm', which together with '%(msecs)03d' would print the
        # milliseconds twice
        formatter = logging.Formatter(
            "%(asctime)s.%(msecs)03d [%(levelname)s] "
            "%(name)s | %(funcName)s:%(lineno)d | %(message)s",
            datefmt='%Y-%m-%d %H:%M:%S',
        )
        file_handler.setFormatter(formatter)
        log.addHandler(file_handler)

    log.propagate = False
    log.setLevel(level)

    return log
Rebecca Kurup Buchholz's avatar
Rebecca Kurup Buchholz committed
60

61
# Module-level loggers, each writing to its own file under generic/logs/

# 'logger' receives the general progress and error messages of the command
logger = logging.getLogger('logger')
setup_logger('logger', 'generic/logs/nead_import.log')

# 'updater_logger' only receives the differences of records that were updated
updater_logger = logging.getLogger('updater_logger')
setup_logger('updater_logger', 'generic/logs/nead_import_updater.log')
68
69


70

71
72
73
74
75
class Command(BaseCommand):

    def add_arguments(self, parser):
        """Register the command line options of the command on ``parser``.

        Required options: source, inputfile, nullvalue, app and model.
        Optional options: tempdir (no default) and dateformat (defaults to
        '%Y-%m-%d %H:%M:%S+00:00').
        """
        # One entry per option: (short flag, long flag, add_argument keywords),
        # listed in the order in which they are registered
        option_specs = (
            ('-s', '--source', {
                'required': True,
                'help': 'Input data source, valid options are a file path: "local" or a url: "web"',
            }),
            ('-i', '--inputfile', {
                'required': True,
                'help': 'Path or URL to input NEAD file',
            }),
            ('-n', '--nullvalue', {
                'required': True,
                'help': 'Null value in input NEAD file',
            }),
            ('-a', '--app', {
                'required': True,
                'help': 'App that Django model belongs to',
            }),
            ('-m', '--model', {
                'required': True,
                'help': 'Django model to import input data into',
            }),
            ('-t', '--tempdir', {
                'help': 'OPTIONAL argument: Path to directory which will contain temporary downloaded file if using "web" '
                        'option for "source" argument',
            }),
            ('-d', '--dateformat', {
                'default': '%Y-%m-%d %H:%M:%S+00:00',
                'help': 'OPTIONAL argument: Date format of timestamp_iso values, '
                        'default argument is in Python datetime.strptime format',
            }),
        )

        for short_flag, long_flag, options in option_specs:
            parser.add_argument(short_flag, long_flag, **options)

    def handle(self, *args, **kwargs):
        """Import one NEAD file (local path or URL) into the requested model.

        Steps:
          1. validate the 'app' and 'model' arguments,
          2. select the line cleaner that belongs to the model's parent class,
          3. resolve the input file (download it first when source is "web"),
          4. read the NEAD database fields from the file header,
          5. create/update the database records and log the totals.

        Invalid arguments, a failed download and a missing input file are
        written to the general log and end the command without raising.
        A file downloaded for the "web" source is always removed again, also
        when the import fails.

        :param kwargs: parsed command options, see add_arguments()
        """
        # Local import: only needed for the fallback of the optional 'tempdir' option
        import tempfile

        # Assign kwargs from command to variables
        inputfile = kwargs['inputfile']
        app = kwargs['app']
        model = kwargs['model']
        source = kwargs['source']
        # 'tempdir' is optional; without the fallback the download path would
        # be built from the literal string 'None'
        tempdir = kwargs.get('tempdir') or tempfile.gettempdir()

        # Validate app
        if not apps.is_installed(app):
            logger.error(f'ERROR app {app} not found')
            return

        # Validate model
        try:
            model_class = self.get_model_cl(app, model)
        except AttributeError as e:
            logger.error(f'ERROR model {model} not found, exception {e}')
            return

        # Get line cleaner function that corresponds to parent class
        parent_class_name = model_class.__base__.__name__
        line_cleaner = self.get_line_cleaner(parent_class_name)

        # Path of the temporary download, stays None for local input files
        downloaded_file = None

        # Validate source and resolve input_file
        if source == 'web':
            url = str(inputfile)
            logger.info(f'STARTED importing input URL: {url}')

            try:
                # Without a timeout requests may wait forever on a stalled server
                response = requests.get(url, timeout=60)
                # Do not store (and later try to parse) an HTTP error page
                response.raise_for_status()
            except requests.RequestException as e:
                logger.error(f'ERROR could not download {url}, exception {e}')
                return

            # TODO test this with storing web data in memory rather than downloading data
            # TODO remove tempdir argument and code references if using memory variable
            input_file = Path(f'{tempdir}/{model}_downloaded.csv')
            with open(input_file, 'wb') as csv_file:
                csv_file.write(response.content)
            downloaded_file = input_file

        elif source == 'local':
            input_file = Path(inputfile)
            logger.info(f'STARTED importing input file: {input_file}')

        else:
            logger.error(f'ERROR non-valid value entered for "source": {source}')
            return

        try:
            # Get NEAD database fields
            with open(input_file) as nead_file:
                nead_database_fields = self.get_nead_database_fields(nead_file)

            # Update database with input_file data cleaned with line_cleaner function
            result = self.update_database(input_file, nead_database_fields, line_cleaner,
                                          model_class, **kwargs)

            # update_database returns None (after logging) when the file is missing
            if result is None:
                return

            records_updated, records_created = result

            logger.info(f'FINISHED importing {input_file} into model {model}: '
                        f'{records_created} new records created; '
                        f'{records_updated} existing records updated')

        except FileNotFoundError as e:
            logger.error(f'ERROR file not found {input_file}, exception {e}')

        finally:
            # If file downloaded from web delete it, also when the import failed
            if downloaded_file is not None and os.path.isfile(downloaded_file):
                os.remove(downloaded_file)

185
186
187
    # Write data in input_file into database after transforming data with line_cleaner function
    # Updates existing records (identified by 'timestamp' field)
    # or creates new records if timestamp does not exist
Rebecca Kurup Buchholz's avatar
Rebecca Kurup Buchholz committed
188
    # Returns number of existing records updated and number of new records created
189
    def update_database(self, input_file: str, nead_database_fields: list, line_cleaner, model_class, **kwargs):
        """Write the data lines of ``input_file`` into ``model_class``.

        Each data line is converted to a dictionary keyed by
        ``nead_database_fields``, passed through ``line_cleaner`` and stored.
        A record whose 'timestamp' already exists is updated, otherwise a new
        record is created. Differences between the old and the updated record
        are written to the updater log.

        :param input_file: path of the NEAD file to import
        :param nead_database_fields: field names, one per column of a data line
        :param line_cleaner: callable(row, dateformat, null_value) returning the
                             cleaned record as a dictionary
        :param model_class: Django model class the records are written to
        :param kwargs: command options; 'nullvalue', 'dateformat' and 'model' are used
        :return: tuple (records_updated, records_created), or None when
                 ``input_file`` does not exist (the error is logged)
        :raises ValueError: if a data line does not have as many columns as
                            ``nead_database_fields``
        """
        # Assign kwargs from command to variables
        null_value = kwargs['nullvalue']
        dateformat = kwargs['dateformat']
        model = kwargs['model']

        # Initialize counters
        records_updated = 0
        records_created = 0

        try:
            with open(input_file) as source:

                # line_number is 1-based so that it matches the line in the file
                for line_number, line in enumerate(source, start=1):

                    # Skip header comment lines that start with '#' or are empty
                    if line.startswith('#') or not line.strip():
                        continue

                    # Transform the line into a dictionary
                    row = self.dict_from_csv_line(line, nead_database_fields, sep=',')

                    # Raise ValueError if row does not have as many columns as nead_database_fields
                    if not row:
                        raise ValueError(f'ERROR: Line number  {line_number} should have the same number of columns as '
                                         f'the nead_database_fields: {len(nead_database_fields)}')

                    # Process row and add new calculated fields
                    line_clean = line_cleaner(row, dateformat, null_value)

                    # Make line_clean['timestamp_iso'] a UTC timezone aware datetime object
                    line_clean['timestamp_iso'] = make_aware(line_clean['timestamp_iso'])

                    # Lines without a usable 'timestamp' are neither created nor updated
                    if not line_clean['timestamp']:
                        continue

                    # Update record if it exists (selected by field 'timestamp'), else create new record
                    try:
                        obj = model_class.objects.get(timestamp=line_clean['timestamp'])
                    except model_class.DoesNotExist:
                        model_class(**line_clean).save()
                        records_created += 1
                        continue

                    # 'exclude' must be a list: with the plain string 'id' the
                    # membership test becomes a substring test and would also
                    # drop fields named 'i' or 'd'
                    old_obj_dict = model_to_dict(obj, exclude=['id'])

                    for key, value in line_clean.items():
                        setattr(obj, key, value)
                    obj.save()
                    records_updated += 1

                    new_obj_dict = model_to_dict(obj, exclude=['id'])

                    # Log difference between old record and updated record
                    difference_dict = DeepDiff(old_obj_dict, new_obj_dict)
                    if difference_dict:
                        timestamp_iso = line_clean['timestamp_iso']
                        updater_logger.info(f'UPDATED record difference for model {model}, '
                                            f'timestamp {timestamp_iso}:  {difference_dict}')

            return records_updated, records_created

        except FileNotFoundError as e:
            logger.error(f'ERROR file not found {input_file}, exception {e}')
            return None
Rebecca Kurup Buchholz's avatar
Rebecca Kurup Buchholz committed
264

265
266
267
268
    # Check which kind of line cleaner should be used
    @staticmethod
    def get_line_cleaner(parent_class_name):

Rebecca Kurup Buchholz's avatar
Rebecca Kurup Buchholz committed
269
        if parent_class_name == 'Station':
270
            return get_gcnet_record_clean
271

272
273
274
275
        else:
            raise Exception(f'ERROR parent class {parent_class_name} does not exist '
                            f'or is not specified in nead_import.py')

276
    # Read NEAD header and return database_fields as list
277
    @staticmethod
278
    def get_nead_database_fields(source):
279

280
        fields_starting_strings = ['# database_fields =', '#database_fields=', '# database_fields=']
281
282
283
284
285
286
287
288
289
        fields_lines = []

        for line in source:
            if any(item in line for item in fields_starting_strings):
                fields_lines.append(line)

        if len(fields_lines) == 1:
            fields_string = fields_lines.pop()
            fields_values = ((fields_string.split('=', 1))[1]).strip().rstrip('\n')
290
291
            database_fields = fields_values.split(',')
            return database_fields
292
293
294
        else:
            raise Exception(f'ERROR input NEAD file does not have exactly one line starting with one of these values:'
                            f' {fields_starting_strings}')
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312

    # Returns dictionary with header as keys and line as values
    # If the number of header and line values do not match then returns None
    @staticmethod
    def dict_from_csv_line(line, header, sep=','):

        line_array = [v.strip() for v in line.strip().split(sep)]

        if len(line_array) != len(header):
            return None

        return {header[i]: line_array[i] for i in range(len(line_array))}

    # Returns model class without parent_class
    @staticmethod
    def get_model_cl(app, model):
        package = importlib.import_module(f'{app}.models')
        return getattr(package, model)