WSL/SLF GitLab Repository

Commit e987d6f8 authored by Sam
Browse files

refactor: update to envidat-utils instead of including utils package

parent a418e9e2
......@@ -3,7 +3,5 @@
# Allow files and directories
!*.py
!utils/**/*.py
!pyproject.toml
!pdm.lock
!__pypackages__/
......@@ -72,7 +72,7 @@ COPY --from=build \
/opt/python/pkgs
ENV PYTHONPATH="/opt/python/pkgs"
WORKDIR /opt/app
COPY main.py utils ./
COPY main.py ./
# Upgrade pip & pre-compile deps to .pyc, add appuser, permissions
RUN python -m pip install --no-cache-dir --upgrade pip \
......
......@@ -17,7 +17,6 @@ AWS_ENDPOINT=xxx
AWS_REGION=xxx
AWS_ACCESS_KEY=xxx
AWS_SECRET_KEY=xxx
AWS_BUCKET_NAME=xxx
```
2a. Local Debug
......
import os
import sys
import logging
import requests
import json
from typing import Optional, NoReturn
from pathlib import Path
from typing import Optional
from collections import OrderedDict
from dateutil.parser import parse
from xmltodict import unparse
from utils.s3 import (
get_s3_connection,
create_s3_bucket,
set_s3_static_config,
generate_index_html,
upload_to_s3_from_memory,
)
from envidat.api.v1 import get_metadata_list_with_resources
from envidat.s3.bucket import Bucket
from envidat.utils import get_logger, load_dotenv_if_in_debug_mode
log = logging.getLogger(__name__)
def _debugger_is_active() -> bool:
"""Check to see if running in debug mode."""
gettrace = getattr(sys, "gettrace", lambda: None)
return gettrace() is not None
def _load_debug_dotenv() -> NoReturn:
"""Load .env.secret variables from repo for debugging."""
from dotenv import load_dotenv
secret_env = Path(".env.secret")
if secret_env.is_file():
load_dotenv(secret_env)
def _get_logger() -> logging.basicConfig:
"""Set logger parameters with log level from environment."""
logging.basicConfig(
level=os.getenv("LOG_LEVEL", default="DEBUG"),
format=(
"%(asctime)s.%(msecs)03d [%(levelname)s] "
"%(name)s | %(funcName)s:%(lineno)d | %(message)s"
),
datefmt="%y-%m-%d %H:%M:%S",
stream=sys.stdout,
)
def _clean_text(text: str) -> str:
"""Returns text cleaned of hashes and with modified characters"""
......@@ -79,104 +40,6 @@ def _get_keywords(package: dict) -> list:
return keywords
def _get_url(url: str) -> Optional[requests.Response]:
    """GET a URL with additional error handling; return None on failure.

    :param url: URL to fetch.
    :return: The ``requests.Response`` on success, otherwise None.

    Bug fix: the except handlers previously logged ``r.request.url``, but
    ``r`` is unbound when ``requests.get`` itself raises (e.g. on a
    connection error), so the handler crashed with NameError and masked
    the real error. The known-good local ``url`` is logged instead.
    """
    try:
        log.debug(f"Attempting to get {url}")
        r = requests.get(url)
        r.raise_for_status()
        return r
    except requests.exceptions.ConnectionError as e:
        log.error(f"Could not connect to internet on get: {url}")
        log.error(e)
    except requests.exceptions.HTTPError as e:
        log.error(f"HTTP response error on get: {url}")
        log.error(e)
    except requests.exceptions.RequestException as e:
        log.error(f"Request error on get: {url}")
        log.error(f"Request: {e.request}")
        log.error(f"Response: {e.response}")
    except Exception as e:
        log.error(e)
        log.error(f"Unhandled exception occurred on get: {url}")
    return None
def get_metadata_list(host: str = None, sort_result: bool = None) -> list:
    """Get package/metadata list from API.

    Host url as a parameter or from environment.

    :param host: API host url. Attempts to get from environment if omitted.
    :param sort_result: Sort result alphabetically by metadata name.
    :return: List of package names.
    :raises AttributeError: If the API call failed and no JSON could be read.
    """
    if host is None:
        host = os.getenv("API_HOST", default="https://www.envidat.ch")
    log.info(f"Getting package list from {host}.")
    response = _get_url(f"{host}/api/3/action/package_list")
    try:
        # _get_url returns None on failure; .json() on None raises AttributeError.
        package_names = response.json()
    except AttributeError as e:
        # Bug fix: the original handler logged the unbound name
        # `package_names`, raising NameError instead of the intended error.
        log.error(e)
        log.error("Getting package names from API failed.")
        raise AttributeError("Failed to extract package names as JSON.") from e
    log.debug("Extracting [result] key from JSON.")
    package_names = list(package_names["result"])
    log.info(f"Returned {len(package_names)} metadata entries from API.")
    if sort_result:
        log.debug("Sorting return alphabetically.")
        package_names = sorted(package_names, reverse=False)
    return package_names
def get_metadata_list_with_resources(host: str = None, sort_result: bool = None) -> list:
    """Get package/metadata list with associated resources from API.

    Host url as a parameter or from environment.

    :param host: API host url. Attempts to get from environment if omitted.
    :param sort_result: Sort result alphabetically by metadata name.
    :return: List of package dicts including their resources.
    :raises AttributeError: If the API call failed and no JSON could be read.

    Note: uses limit 100000, otherwise returns only 10 results.
    """
    if host is None:
        host = os.getenv("API_HOST", default="https://www.envidat.ch")
    log.info(f"Getting package list with resources from {host}.")
    response = _get_url(
        f"{host}/api/3/action/current_package_list_with_resources?limit=100000"
    )
    try:
        # _get_url returns None on failure; .json() on None raises AttributeError.
        package_names_with_resources = response.json()
    except AttributeError as e:
        # Bug fix: the original handler logged the unbound name
        # `package_names_with_resources`, raising NameError instead of the
        # intended error.
        log.error(e)
        log.error("Getting package names with resources from API failed.")
        raise AttributeError("Failed to extract package names as JSON.") from e
    log.debug("Extracting [result] key from JSON.")
    package_names_with_resources = list(package_names_with_resources["result"])
    log.info(f"Returned {len(package_names_with_resources)} metadata entries from API.")
    if sort_result:
        log.debug("Sorting return by nested 'name' key alphabetically.")
        package_names_with_resources = sorted(
            package_names_with_resources, key=lambda x: x["name"], reverse=False
        )
    return package_names_with_resources
def get_distribution_list(package: dict, package_name: str) -> list:
"""Return distribution_list created from package resources list and licence_id."""
......@@ -292,7 +155,10 @@ def get_distribution_list(package: dict, package_name: str) -> list:
def get_wrapper_dict(converted_packages: list) -> dict:
"""Returns wrapper dictionary (with catalog and root tags) for converted packages."""
"""
Returns wrapper dictionary (with catalog and root tags)
for converted packages.
"""
# Assign catalog_dict for header and converted_packages
catalog_dict = OrderedDict()
......@@ -369,12 +235,16 @@ def get_opendataswiss_ordered_dict(package: dict) -> Optional[OrderedDict]:
"publisher", ""
)
md_metadata_dict["dcat:Dataset"]["dct:publisher"] = {
"foaf:Organization": {"@rdf:about": "https://envidat.ch/#/about",
"foaf:name": publisher_name}
"foaf:Organization": {
"@rdf:about": "https://envidat.ch/#/about",
"foaf:name": publisher_name,
}
}
# landing page
md_metadata_dict["dcat:Dataset"]["dcat:landingPage"] = {"@rdf:resource": package_url}
md_metadata_dict["dcat:Dataset"]["dcat:landingPage"] = {
"@rdf:resource": package_url
}
# contact point (MANDATORY)
maintainer = json.loads(package.get("maintainer", "{}"))
......@@ -423,9 +293,12 @@ def get_opendataswiss_ordered_dict(package: dict) -> Optional[OrderedDict]:
keywords_list += [{"@xml:lang": "en", "#text": keyword}]
md_metadata_dict["dcat:Dataset"]["dcat:keyword"] = keywords_list
# Distribution - iterate through package resources and obtain package license (MANDATORY)
# Distribution - iterate through package resources and obtain
# package license (MANDATORY)
# Call get_distribution_list(package) to get distribution list
md_metadata_dict["dcat:Dataset"]["dcat:distribution"] = get_distribution_list(package, package_name)
md_metadata_dict["dcat:Dataset"]["dcat:distribution"] = get_distribution_list(
package, package_name
)
return md_metadata_dict
......@@ -447,8 +320,7 @@ def envidat_to_opendataswiss_converter() -> str:
converted_packages = []
# TODO remove sort_result, let it use default value of None
metadata_list = get_metadata_list_with_resources(sort_result=True)
metadata_list = get_metadata_list_with_resources()
# Try to convert packages to dictionaries compatible with OpenDataSwiss format
try:
......@@ -477,25 +349,21 @@ def envidat_to_opendataswiss_converter() -> str:
def main():
"""Main script logic."""
if _debugger_is_active():
_load_debug_dotenv()
_get_logger()
load_dotenv_if_in_debug_mode(env_file=".env.secret")
get_logger()
log.info("Starting main opendataswiss script.")
xml_data = envidat_to_opendataswiss_converter()
xml_name = "envidat_export_opendataswiss.xml"
s3_client = get_s3_connection()
bucket = create_s3_bucket(s3_client, public=True)
log.debug(f"Attempting upload of {xml_name} to S3 bucket.")
upload_to_s3_from_memory(bucket, xml_name, xml_data)
s3_bucket = Bucket(bucket_name="opendataswiss", is_new=True, is_public=True)
s3_bucket.put(xml_name, xml_data)
set_s3_static_config(s3_client)
index_html = generate_index_html("OpenDataSwiss XML", xml_name)
log.debug("Attempting upload of index.html to S3 bucket.")
upload_to_s3_from_memory(bucket, "index.html", index_html, content_type="text/html")
s3_bucket.configure_static_website()
s3_bucket.generate_index_html("EnviDat OpenDataSwiss XML", xml_name)
log.info("Done.")
log.info("Finished main opendataswiss script.")
if __name__ == "__main__":
......
[[package]]
name = "boto3"
version = "1.22.12"
version = "1.23.2"
requires_python = ">= 3.6"
summary = "The AWS SDK for Python"
dependencies = [
"botocore<1.26.0,>=1.25.12",
"botocore<1.27.0,>=1.26.2",
"jmespath<2.0.0,>=0.7.1",
"s3transfer<0.6.0,>=0.5.0",
]
[[package]]
name = "botocore"
version = "1.25.12"
version = "1.26.2"
requires_python = ">= 3.6"
summary = "Low-level, data-driven core of boto 3."
dependencies = [
......@@ -31,6 +31,27 @@ version = "2.0.12"
requires_python = ">=3.5.0"
summary = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
[[package]]
name = "envidat-utils"
version = "0.2.1"
requires_python = ">=3.9"
summary = "Utilities in Python for the WSL EnviDat project."
dependencies = [
"boto3>=1.22.12",
"requests==2.27.1",
]
[[package]]
name = "envidat-utils"
version = "0.2.0"
extras = ["dotenv"]
requires_python = ">=3.9"
summary = "Utilities in Python for the WSL EnviDat project."
dependencies = [
"envidat-utils>=0.2.0",
"python-dotenv>=0.20.0",
]
[[package]]
name = "idna"
version = "3.3"
......@@ -99,15 +120,16 @@ summary = "Makes working with XML feel like you are working with JSON"
[metadata]
lock_version = "3.1"
content_hash = "sha256:30fb3c469b3a614aeaf3250b26ec0ed7ee4661c0e389d62ed676a477ec97429b"
content_hash = "sha256:9576349b6caaa6226266cdd2dc10a43c4c2662643dfbe55b889edf668cc0b09a"
[metadata.files]
"boto3 1.22.12" = [
{file = "boto3-1.22.12-py3-none-any.whl", hash = "sha256:9830d7f8748c164a3f0929d8a0c5bb313cc62d7cf69ce55617108bed451a8520"},
{file = "boto3-1.22.12.tar.gz", hash = "sha256:4b3a49abf7a5f7cdd82714a3ae356a9a8ce12a668e014c5fc68454aa1e2fc0cb"},
"boto3 1.23.2" = [
{file = "boto3-1.23.2-py3-none-any.whl", hash = "sha256:7889c3a07171b8a43468a8644d7c95948dc9e1389c4aac2b689a428ee1a98300"},
{file = "boto3-1.23.2.tar.gz", hash = "sha256:4408cf07340d29d7a9c8d32cf71b1c54f86b768b2145d341d2698c1e467d7d32"},
]
"botocore 1.25.12" = [
{file = "botocore-1.25.12-py3-none-any.whl", hash = "sha256:53e19890124be45e47ec4f7ffdaf587343d375dbd7c7a501e55aeff80680fec0"},
"botocore 1.26.2" = [
{file = "botocore-1.26.2-py3-none-any.whl", hash = "sha256:1977f2ad6b6263f4dd9e8b784e69b194988f16d6bd90c4eede15964f4eecf878"},
{file = "botocore-1.26.2.tar.gz", hash = "sha256:16b9d523a19d61b0edc80ef2253f9130165bad473b1b5707027f10975a8d5467"},
]
"certifi 2021.10.8" = [
{file = "certifi-2021.10.8-py2.py3-none-any.whl", hash = "sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"},
......@@ -117,6 +139,14 @@ content_hash = "sha256:30fb3c469b3a614aeaf3250b26ec0ed7ee4661c0e389d62ed676a477e
{file = "charset_normalizer-2.0.12-py3-none-any.whl", hash = "sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df"},
{file = "charset-normalizer-2.0.12.tar.gz", hash = "sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597"},
]
"envidat-utils 0.2.1" = [
{file = "envidat_utils-0.2.1-py3-none-any.whl", hash = "sha256:79b892e56174a1971a3d6798a082c19a8fa3767401518a9862e5e3f9115a67a2"},
{file = "envidat-utils-0.2.1.tar.gz", hash = "sha256:a1c4f894eeb22ddf263dcd98612dee203b2402eb06f129dd0a3086699b588741"},
]
"envidat-utils 0.2.0" = [
{file = "envidat_utils-0.2.0-py3-none-any.whl", hash = "sha256:b490bae59403cc95ef2d85d4e4d7bc1c9e613090fb0e981bff0b5631e39e8d9b"},
{file = "envidat-utils-0.2.0.tar.gz", hash = "sha256:dd42a36407153580b0181f4ba839435634ab11fffccaa1f7d560c638fe3ecd5e"},
]
"idna 3.3" = [
{file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"},
{file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"},
......
......@@ -7,24 +7,15 @@ authors = [
{name= "Rebecca Kurup Buchholz", email = "None"}
]
dependencies = [
"requests==2.27.1",
"envidat-utils[dotenv]>=0.2.1",
"xmltodict>=0.13.0",
"python-dateutil>=2.8.2",
"boto3>=1.22.12",
]
requires-python = ">=3.9"
license = {text = "MIT"}
[project.optional-dependencies]
[tool.pdm]
[[tool.pdm.source]]
name = "pypi"
url = "https://pypi.org/simple"
verify_ssl = true
[tool.pdm.dev-dependencies]
dev = [
"python-dotenv>=0.20.0",
]
[build-system]
requires = ["pdm-pep517"]
......
import os
import logging
import json
import boto3
from typing import Any, NoReturn, Union
from io import BytesIO
from textwrap import dedent
from botocore.config import Config
log = logging.getLogger(__name__)
def get_s3_connection() -> "boto3.resource":
    """Build an S3 resource client from AWS_* environment variables."""
    endpoint = os.getenv("AWS_ENDPOINT")
    log.debug(f"Initialise S3 resource client with endpoint: {endpoint}")
    # Signature version s3v4 is required by the SWITCH object store.
    return boto3.resource(
        "s3",
        aws_access_key_id=os.getenv("AWS_ACCESS_KEY"),
        aws_secret_access_key=os.getenv("AWS_SECRET_KEY"),
        endpoint_url=endpoint,
        region_name=os.getenv("AWS_REGION", default=""),
        config=Config(signature_version="s3v4"),
    )
def get_s3_bucket(
    s3: "boto3.resource", bucket_name: str = None
) -> "boto3.resource.Bucket":
    """Return an S3 bucket handle, or None if it cannot be accessed."""
    if bucket_name is None:
        log.debug("Getting bucket name from environment variable.")
        bucket_name = os.getenv("AWS_BUCKET_NAME")
    log.debug("Getting bucket...")
    try:
        return s3.Bucket(bucket_name)
    except Exception as e:
        # Best-effort: log the failure and signal it with a None return.
        log.error(e)
        log.error(f"Failed to access bucket named: {bucket_name}")
        return None
def create_s3_bucket(
    s3: "boto3.resource", bucket_name: str = None, public: bool = False
) -> dict:
    """Create a new S3 bucket, returning None on failure.

    Response Syntax:
    {
        'Location': 'string'
    }
    """
    if bucket_name is None:
        log.debug("Getting bucket name from environment variable.")
        bucket_name = os.getenv("AWS_BUCKET_NAME")
    acl = "public-read" if public else "private"
    try:
        log.debug("Creating bucket...")
        new_bucket = s3.create_bucket(
            ACL=acl,
            Bucket=bucket_name,
            # "zh" = SWITCH Zurich location constraint.
            CreateBucketConfiguration={"LocationConstraint": "zh"},
            ObjectLockEnabledForBucket=False,
        )
    except s3.meta.client.exceptions.BucketAlreadyExists as e:
        log.error(e)
        log.error(f"Bucket named {bucket_name} already exists. Creation failed.")
    except Exception as e:
        log.error(e)
        log.error(f"Failed to create bucket named: {bucket_name}")
    else:
        log.debug(f"Created bucket: {bucket_name}")
        return new_bucket
    return None
def download_s3_object_to_memory(path: str, bucket: "boto3.resource.Bucket") -> str:
    """Download an S3 object and return it decoded as a UTF-8 string.

    :param path: Object key within the bucket.
    :param bucket: boto3 Bucket resource to download from.
    :return: The object content decoded as UTF-8. Returns an empty string
        when the download fails (best-effort; the error is logged).

    To use:
        text = download_s3_object_to_memory("index.html", bucket)

    Fixes the original annotation, which claimed ``BytesIO`` although the
    buffer is read and decoded before returning; the old docstring example
    also chained ``.read().decode(...)`` on the already-decoded result.
    """
    log.debug(f"Attempting download of key: {path} to memory.")
    file = bucket.Object(path)
    buf = BytesIO()
    try:
        file.download_fileobj(buf)
        log.info(f"Successful download: {path}")
    except Exception as e:
        log.error(e)
        log.error(f"Failed to download {path}")
    buf.seek(0)
    return buf.read().decode("utf_8")
def upload_to_s3_from_memory(
    bucket: "boto3.resource.Bucket", key: str, data: Any, content_type: str = None
) -> bool:
    """Upload an in-memory object to an S3 bucket; return success flag."""
    log.debug("Writing memory object buffer.")
    stream = BytesIO(data.encode("utf_8"))
    try:
        extra_args = {"ContentType": content_type} if content_type else None
        bucket.upload_fileobj(stream, key, ExtraArgs=extra_args)
        log.info(f"Successful upload: {key}")
    except Exception as e:
        log.error(e)
        log.error(f"Failed to upload {key}")
        return False
    return True
def set_s3_static_config(s3: "boto3.resource", bucket_name: str = None) -> None:
    """Add static website hosting config to an S3 bucket.

    Note: WARNING this will set all data to public read policy.

    :param s3: boto3 S3 resource client.
    :param bucket_name: Bucket to configure. Read from AWS_BUCKET_NAME
        environment variable if omitted.
    :return: None. (The original ``NoReturn`` annotation was incorrect:
        ``NoReturn`` means the function never returns; this one returns
        normally.)
    """
    if bucket_name is None:
        log.debug("Getting bucket name from environment variable.")
        bucket_name = os.getenv("AWS_BUCKET_NAME")
    try:
        log.debug("Setting public read access policy for static website.")
        # Public-read bucket policy: any principal may GET any object.
        public_policy = {
            "Version": "2012-10-17",
            "Statement": [
                {
                    "Sid": "PublicRead",
                    "Effect": "Allow",
                    "Principal": "*",
                    "Action": "s3:GetObject",
                    "Resource": f"arn:aws:s3:::{bucket_name}/*",
                }
            ],
        }
        bucket_policy = json.dumps(public_policy)
        s3.meta.client.put_bucket_policy(Bucket=bucket_name, Policy=bucket_policy)
        log.debug("Setting S3 static website configuration...")
        s3.meta.client.put_bucket_website(
            Bucket=bucket_name,
            WebsiteConfiguration={
                "ErrorDocument": {
                    "Key": "error.html",
                },
                "IndexDocument": {
                    "Suffix": "index.html",
                },
            },
        )
        log.debug(f"Static website configured for bucket: {bucket_name}")
    except Exception as e:
        # Best-effort: log and continue rather than abort the caller.
        log.error(e)
        log.error(f"Failed to set static hosting on bucket named: {bucket_name}")
def generate_index_html(
    title: str, file_list: Union[list, str], bucket_name: str = None
) -> str:
    """Render an index.html for an S3 bucket root, with embedded download links.

    :param title: HTML page title.
    :param file_list: Single file name or list of file names to link to.
    :param bucket_name: Bucket name. Read from AWS_BUCKET_NAME environment
        variable if omitted.
    :return: The HTML document decoded as a UTF-8 string. (The original
        ``BytesIO`` annotation was incorrect: the buffer is read and
        decoded before returning.)
    """
    if bucket_name is None:
        log.debug("Getting bucket name from environment variable.")
        bucket_name = os.getenv("AWS_BUCKET_NAME")
    if isinstance(file_list, str):
        file_list = [file_list]
    buf = BytesIO()
    # Start HTML
    html_block = dedent(
        f"""
        <html>
        <head>
        <meta charset="utf-8">
        <title>{title}</title>
        </head>
        <body>
        """
    ).strip()
    log.debug(f"Writing start HTML block to buffer: {html_block}")
    buf.write(html_block.encode("utf_8"))
    # Files
    log.info("Iterating file list to write S3 links to index.")
    for file_name in file_list:
        log.debug(f"File name: {file_name}")
        # NOTE(review): link host assumes the SWITCH "zh" endpoint naming.
        html_block = dedent(
            f"""
            <div class='flex py-2 xs6'>
                <a href='https://{bucket_name}.s3-zh.os.switch.ch/{file_name}'>
                    https://{bucket_name}.s3-zh.os.switch.ch/{file_name}
                </a>
            </div>"""
        )
        log.debug(f"Writing file link HTML to buffer: {html_block}")
        buf.write(html_block.encode("utf_8"))
    # Close
    html_block = dedent(
        """
        </body>
        </html>"""
    )
    log.debug(f"Writing end HTML block to buffer: {html_block}")
    buf.write(html_block.encode("utf_8"))
    buf.seek(0)
    return buf.read().decode("utf_8")