This code was copied from another project and should be used as a reference. I'm not sure the functions directory is a good location, and the task definitions include incorrect account information. The task definition should be dynamically generated by the deploy script; a sketch of that approach follows.
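As a rough illustration of that last point, a deploy step can resolve account-specific values at deploy time and render the task definition from a template, rather than committing one with hard-coded account IDs. The snippet below is a minimal, hypothetical sketch (the template path, placeholder names, and use of Python/boto3 are assumptions for illustration, not this repo's actual deploy tooling):

# Hypothetical sketch: render an ECS task definition at deploy time so no
# account-specific values are committed to the repo. The template path and
# placeholder names ($AWS_ACCOUNT_ID, $AWS_REGION) are illustrative assumptions.
import json
from string import Template

import boto3


def render_task_definition(template_path="task-definition.tmpl.json", region="us-east-1"):
    # Resolve the current AWS account ID instead of hard-coding it
    account_id = boto3.client("sts").get_caller_identity()["Account"]
    with open(template_path) as f:
        rendered = Template(f.read()).safe_substitute(
            AWS_ACCOUNT_ID=account_id,
            AWS_REGION=region,
        )
    # The rendered document could then be passed to ecs.register_task_definition
    return json.loads(rendered)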
506 lines · 17 KiB · Python

# Unless explicitly stated otherwise all files in this repository are licensed
# under the Apache License Version 2.0.
# This product includes software developed at Datadog (https://www.datadoghq.com/).
# Copyright 2018 Datadog, Inc.

from __future__ import print_function

import base64
import gzip
import json
import os
import re
import socket
import ssl
import urllib
from io import BytesIO, BufferedReader

import boto3

# Proxy
# Define the proxy endpoint to forward the logs to
DD_SITE = os.getenv("DD_SITE", default="datadoghq.com")
DD_URL = os.getenv("DD_URL", default="lambda-intake.logs." + DD_SITE)

# Define the proxy port to forward the logs to
try:
    if "DD_SITE" in os.environ and DD_SITE == "datadoghq.eu":
        DD_PORT = int(os.environ.get("DD_PORT", 443))
    else:
        DD_PORT = int(os.environ.get("DD_PORT", 10516))
except Exception:
    DD_PORT = 10516

# Scrubbing sensitive data
# Option to redact all patterns that look like an IP address / email address
try:
    is_ipscrubbing = os.environ["REDACT_IP"]
except Exception:
    is_ipscrubbing = False
try:
    is_emailscrubbing = os.environ["REDACT_EMAIL"]
except Exception:
    is_emailscrubbing = False

# DD_API_KEY: Datadog API Key
DD_API_KEY = "<your_api_key>"
if "DD_KMS_API_KEY" in os.environ:
    ENCRYPTED = os.environ["DD_KMS_API_KEY"]
    DD_API_KEY = boto3.client("kms").decrypt(
        CiphertextBlob=base64.b64decode(ENCRYPTED)
    )["Plaintext"]
elif "DD_API_KEY" in os.environ:
    DD_API_KEY = os.environ["DD_API_KEY"]

# Strip any trailing and leading whitespace from the API key
DD_API_KEY = DD_API_KEY.strip()

cloudtrail_regex = re.compile(
    r"\d+_CloudTrail_\w{2}-\w{4,9}-\d_\d{8}T\d{4}Z.+.json.gz$", re.I
)
ip_regex = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", re.I)
email_regex = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", re.I)

DD_SOURCE = "ddsource"
DD_CUSTOM_TAGS = "ddtags"
DD_SERVICE = "service"
DD_HOST = "host"
DD_FORWARDER_VERSION = "1.2.3"

# Pass custom tags as environment variable, ensure comma separated, no trailing comma in envvar!
# e.g. DD_TAGS="key1:value1,key2:value2"
DD_TAGS = os.environ.get("DD_TAGS", "")

class DatadogConnection(object):
    def __init__(self, host, port, ddApiKey):
        self.host = host
        self.port = port
        self.api_key = ddApiKey
        self._sock = None

    def _connect(self):
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        s = ssl.wrap_socket(s)
        s.connect((self.host, self.port))
        return s

    def safe_submit_log(self, log, metadata):
        try:
            self.send_entry(log, metadata)
        except Exception as e:
            # retry once
            if self._sock:
                # make sure we don't keep old connections open
                self._sock.close()
            self._sock = self._connect()
            self.send_entry(log, metadata)
        return self

    def send_entry(self, log_entry, metadata):
        # The log_entry can only be a string or a dict
        if isinstance(log_entry, str):
            log_entry = {"message": log_entry}
        elif not isinstance(log_entry, dict):
            raise Exception(
                "Cannot send the entry as it must be either a string or a dict. Provided entry: "
                + str(log_entry)
            )

        # Merge with metadata
        log_entry = merge_dicts(log_entry, metadata)

        # Send to Datadog
        str_entry = json.dumps(log_entry)

        # Scrub IP addresses if activated
        if is_ipscrubbing:
            try:
                str_entry = ip_regex.sub("xxx.xxx.xxx.xx", str_entry)
            except Exception as e:
                print(
                    "Unexpected exception while scrubbing logs: {} for event {}".format(
                        str(e), str_entry
                    )
                )
        # Scrub email addresses if activated
        if is_emailscrubbing:
            try:
                str_entry = email_regex.sub("xxxxx@xxxxx.com", str_entry)
            except Exception as e:
                print(
                    "Unexpected exception while scrubbing logs: {} for event {}".format(
                        str(e), str_entry
                    )
                )

        # For debugging purposes, uncomment the following line
        # print(str_entry)
        prefix = "%s " % self.api_key
        return self._sock.send((prefix + str_entry + "\n").encode("UTF-8"))

    def __enter__(self):
        self._sock = self._connect()
        return self

    def __exit__(self, ex_type, ex_value, traceback):
        if self._sock:
            self._sock.close()
        if ex_type is not None:
            print("DatadogConnection exit: ", ex_type, ex_value, traceback)

def lambda_handler(event, context):
    # Check prerequisites
    if DD_API_KEY == "<your_api_key>" or DD_API_KEY == "":
        raise Exception(
            "You must configure your API key before starting this lambda function (see #Parameters section)"
        )
    # Check if the API key is the correct number of characters
    if len(DD_API_KEY) != 32:
        raise Exception(
            "The API key is not the expected length. Please confirm that your API key is correct"
        )

    metadata = {"ddsourcecategory": "aws"}

    # create socket
    with DatadogConnection(DD_URL, DD_PORT, DD_API_KEY) as con:
        # Add the context to meta
        if "aws" not in metadata:
            metadata["aws"] = {}
        aws_meta = metadata["aws"]
        aws_meta["function_version"] = context.function_version
        aws_meta["invoked_function_arn"] = context.invoked_function_arn
        # Add custom tags here by adding new value with the following format "key1:value1, key2:value2" - might be subject to modifications
        dd_custom_tags_data = {
            "forwardername": context.function_name.lower(),
            "memorysize": context.memory_limit_in_mb,
            "forwarder_version": DD_FORWARDER_VERSION,
        }
        metadata[DD_CUSTOM_TAGS] = ",".join(
            filter(
                None,
                [
                    DD_TAGS,
                    ",".join(
                        [
                            "{}:{}".format(k, v)
                            for k, v in dd_custom_tags_data.iteritems()
                        ]
                    ),
                ],
            )
        )

        try:
            logs = generate_logs(event, context, metadata)
            for log in logs:
                con = con.safe_submit_log(log, metadata)
        except Exception as e:
            print("Unexpected exception: {} for event {}".format(str(e), event))

def generate_logs(event, context, metadata):
    try:
        # Route to the corresponding parser
        event_type = parse_event_type(event)
        if event_type == "s3":
            logs = s3_handler(event, context, metadata)
        elif event_type == "awslogs":
            logs = awslogs_handler(event, context, metadata)
        elif event_type == "events":
            logs = cwevent_handler(event, metadata)
        elif event_type == "sns":
            logs = sns_handler(event, metadata)
    except Exception as e:
        # Log the parsing error through the socket
        err_message = "Error parsing the object. Exception: {} for event {}".format(
            str(e), event
        )
        logs = [err_message]
    return logs


# Utility functions

def parse_event_type(event):
    if "Records" in event and len(event["Records"]) > 0:
        if "s3" in event["Records"][0]:
            return "s3"
        elif "Sns" in event["Records"][0]:
            return "sns"

    elif "awslogs" in event:
        return "awslogs"

    elif "detail" in event:
        return "events"
    raise Exception("Event type not supported (see #Event supported section)")

# Handle S3 events
def s3_handler(event, context, metadata):
    s3 = boto3.client("s3")

    # Get the object from the event and show its content type
    bucket = event["Records"][0]["s3"]["bucket"]["name"]
    key = urllib.unquote_plus(event["Records"][0]["s3"]["object"]["key"]).decode("utf8")

    keyMetadata = parse_key_metadata(key)
    for k in keyMetadata:
        metadata[k] = keyMetadata[k]

    source = parse_event_source(event, key)
    metadata[DD_SOURCE] = source

    # Get the ARN of the service and set it as the hostname
    if DD_HOST not in metadata.keys():
        hostname = parse_service_arn(source, key, bucket, context)
        if hostname:
            metadata[DD_HOST] = hostname

    # Default service to source value
    if DD_SERVICE not in metadata.keys():
        metadata[DD_SERVICE] = source

    # Extract the S3 object
    response = s3.get_object(Bucket=bucket, Key=key)
    body = response["Body"]
    data = body.read()

    # If the name has a .gz extension, then decompress the data
    if key[-3:] == ".gz":
        with gzip.GzipFile(fileobj=BytesIO(data)) as decompress_stream:
            # Reading line by line avoids a bug where gzip would take a very long
            # time (>5min) for files around 60MB gzipped
            data = "".join(BufferedReader(decompress_stream))

    if is_cloudtrail(str(key)):
        cloud_trail = json.loads(data)
        for event in cloud_trail["Records"]:
            # Create structured object and send it
            structured_line = merge_dicts(
                event, {"aws": {"s3": {"bucket": bucket, "key": key}}}
            )
            yield structured_line
    else:
        # Send lines to Datadog
        for line in data.splitlines():
            # Create structured object and send it
            structured_line = {
                "aws": {"s3": {"bucket": bucket, "key": key}},
                "message": line,
            }
            yield structured_line

# Handle CloudWatch logs
def awslogs_handler(event, context, metadata):
    # Get logs
    with gzip.GzipFile(
        fileobj=BytesIO(base64.b64decode(event["awslogs"]["data"]))
    ) as decompress_stream:
        # Reading line by line avoids a bug where gzip would take a very long
        # time (>5min) for files around 60MB gzipped
        data = "".join(BufferedReader(decompress_stream))
    logs = json.loads(str(data))

    # Set the source on the logs
    source = logs.get("logGroup", "cloudwatch")
    metadata[DD_SOURCE] = parse_event_source(event, source)

    # Default service to source value
    metadata[DD_SERVICE] = metadata[DD_SOURCE]

    # Build aws attributes
    aws_attributes = {
        "aws": {
            "awslogs": {
                "logGroup": logs["logGroup"],
                "logStream": logs["logStream"],
                "owner": logs["owner"],
            }
        }
    }

    # For Lambda logs we want to extract the function name,
    # then rebuild the arn of the monitored lambda using that name.
    # Start by splitting the log group to get the function name
    if metadata[DD_SOURCE] == "lambda":
        log_group_parts = logs["logGroup"].split("/lambda/")
        if len(log_group_parts) > 1:
            function_name = log_group_parts[1].lower()
            # Split the arn of the forwarder to extract the prefix
            arn_parts = context.invoked_function_arn.split("function:")
            if len(arn_parts) > 0:
                arn_prefix = arn_parts[0]
                # Rebuild the arn by replacing the function name
                arn = arn_prefix + "function:" + function_name
                # Add the arn as a log attribute
                arn_attributes = {"lambda": {"arn": arn}}
                aws_attributes = merge_dicts(aws_attributes, arn_attributes)
                # Add the function name as tag
                metadata[DD_CUSTOM_TAGS] += ",functionname:" + function_name
                # Set the arn as the hostname
                metadata[DD_HOST] = arn

    # Create and send structured logs to Datadog
    for log in logs["logEvents"]:
        yield merge_dicts(log, aws_attributes)

# Handle CloudWatch events
def cwevent_handler(event, metadata):
    data = event

    # Set the source on the log
    source = data.get("source", "cloudwatch")
    service = source.split(".")
    if len(service) > 1:
        metadata[DD_SOURCE] = service[1]
    else:
        metadata[DD_SOURCE] = "cloudwatch"
    # Default service to source value
    metadata[DD_SERVICE] = metadata[DD_SOURCE]

    yield data

# Handle SNS events
def sns_handler(event, metadata):
    data = event
    # Set the source on the log
    metadata[DD_SOURCE] = parse_event_source(event, "sns")

    for ev in data["Records"]:
        # Create structured object and send it
        structured_line = ev
        yield structured_line

def merge_dicts(a, b, path=None):
    if path is None:
        path = []
    for key in b:
        if key in a:
            if isinstance(a[key], dict) and isinstance(b[key], dict):
                merge_dicts(a[key], b[key], path + [str(key)])
            elif a[key] == b[key]:
                pass  # same leaf value
            else:
                raise Exception(
                    "Conflict while merging metadata and the log entry at %s"
                    % ".".join(path + [str(key)])
                )
        else:
            a[key] = b[key]
    return a

def is_cloudtrail(key):
    match = cloudtrail_regex.search(key)
    return bool(match)

def parse_event_source(event, key):
    if "elasticloadbalancing" in key:
        return "elb"
    for source in [
        "lambda",
        "redshift",
        "cloudfront",
        "kinesis",
        "mariadb",
        "mysql",
        "apigateway",
        "route53",
        "vpc",
        "rds",
        "sns",
        "waf",
        "docdb",
        "ecs",
    ]:
        if source in key:
            return source
    if "API-Gateway" in key:
        return "apigateway"
    if is_cloudtrail(str(key)):
        return "cloudtrail"
    if "awslogs" in event:
        return "cloudwatch"
    if "Records" in event and len(event["Records"]) > 0:
        if "s3" in event["Records"][0]:
            return "s3"
    return "aws"

def parse_service_arn(source, key, bucket, context):
    if source == "elb":
        # For ELB logs we parse the filename to extract parameters in order to rebuild the ARN
        # 1. We extract the region from the filename
        # 2. We extract the loadbalancer name and replace the "." by "/" to match the ARN format
        # 3. We extract the id of the loadbalancer
        # 4. We build the arn
        keysplit = key.split("_")
        idsplit = key.split("/")
        if len(keysplit) > 3:
            region = keysplit[2].lower()
            name = keysplit[3]
            elbname = name.replace(".", "/")
            if len(idsplit) > 1:
                idvalue = idsplit[1]
                return "arn:aws:elasticloadbalancing:" + region + ":" + idvalue + ":loadbalancer/" + elbname
    if source == "s3":
        # For S3 access logs we use the bucket name to rebuild the arn
        if bucket:
            return "arn:aws:s3:::" + bucket
    if source == "cloudfront":
        # For CloudFront logs we need to get the account and distribution id from the lambda arn and the filename
        # 1. We extract the cloudfront id from the filename
        # 2. We extract the AWS account id from the lambda arn
        # 3. We build the arn
        namesplit = key.split("/")
        if len(namesplit) > 0:
            filename = namesplit[len(namesplit) - 1]
            # (distribution-ID.YYYY-MM-DD-HH.unique-ID.gz)
            filenamesplit = filename.split(".")
            if len(filenamesplit) > 3:
                distributionID = filenamesplit[len(filenamesplit) - 4].lower()
                arn = context.invoked_function_arn
                arnsplit = arn.split(":")
                if len(arnsplit) == 7:
                    awsaccountID = arnsplit[4].lower()
                    return "arn:aws:cloudfront::" + awsaccountID + ":distribution/" + distributionID
    if source == "redshift":
        # For Redshift logs we leverage the filename to extract the relevant information
        # 1. We extract the region from the filename
        # 2. We extract the account-id from the filename
        # 3. We extract the name of the cluster
        # 4. We build the arn: arn:aws:redshift:region:account-id:cluster:cluster-name
        namesplit = key.split("/")
        if len(namesplit) == 8:
            region = namesplit[3].lower()
            accountID = namesplit[1].lower()
            filename = namesplit[7]
            filesplit = filename.split("_")
            if len(filesplit) == 6:
                clustername = filesplit[3]
                return "arn:aws:redshift:" + region + ":" + accountID + ":cluster:" + clustername
    return

def parse_key_metadata(key):
    metadata = {}
    keysplit = key.split("/")
    for k in keysplit:
        if "=" in k:
            # key=value path segments, e.g. "env=prod"
            prt = k.split("=")
            metadata[prt[0]] = prt[1]
        elif "_" in k:
            # prefixed path segments, e.g. "cluster_mycluster"
            kn = k.split("_")[0]
            if kn in ["source", "cluster", "service", "env", "region", "host"]:
                metadata[kn] = k.replace(kn + "_", "")

    return metadata