diff --git a/README.md b/README.md
index f3768f0..96fba81 100644
--- a/README.md
+++ b/README.md
@@ -253,6 +253,7 @@ A collection of delicious docker recipes.
 - [x] tutum/builder
 - [x] browserless/chrome
 - [x] certbot
+- [x] streamsets/datacollector
 - [x] cachethq/docker
 - [x] puckel/docker-airflow
 - [x] drone/drone
diff --git a/sdc/README.md b/sdc/README.md
new file mode 100644
index 0000000..9e8eaf3
--- /dev/null
+++ b/sdc/README.md
@@ -0,0 +1,11 @@
+sdc
+===
+
+```bash
+$ docker-compose up -d
+$ docker-compose exec sdc id
+uid=20159(sdc) gid=20159(sdc) groups=20159(sdc)
+$ chown 20159:20159 data/*.properties
+$ chmod 600 data/form-realm.properties
+$ docker-compose restart
+```
diff --git a/sdc/data/form-realm.properties b/sdc/data/form-realm.properties
new file mode 100644
index 0000000..e1996a1
--- /dev/null
+++ b/sdc/data/form-realm.properties
@@ -0,0 +1,46 @@
+#
+# Copyright 2017 StreamSets Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#The format is
+#  <username>: MD5:<md5-of-user-password>[,user,<role>,<role>,...,<group>,<group>,....]
+#
+# Supported roles are: admin, manager, creator, guest
+#
+# 'user' must always be present
+#
+# Prefix with 'group:' for group information for the user.
+#
+
+# FORM authentication, password is same as user name
+admin: MD5:21232f297a57a5a743894a0e4a801fc3,user,admin
+guest: MD5:084e0343a0486ff05530df6c705c8bb4,user,guest
+creator: MD5:ee2433259b0fe399b40e81d2c98a38b6,user,creator
+manager: MD5:1d0258c2440a8d19e716292b231e3190,user,manager
+user1: MD5:24c9e15e52afc47c225b757e7bee1f9d,user,manager,creator,group:dev
+user2: MD5:7e58d63b60197ceb55a1c487989a3720,user,manager,creator,group:dev
+user3: MD5:92877af70a45fd6a2ed7fe81e1236b78,user,manager,creator,group:test
+user4: MD5:3f02ebe3d7929b091e3d8ccfde2f3bc6,user,manager,creator,group:test
+
+
+#
+# To compute the MD5 run the following command:
+#
+# OSX:
+#   $ echo -n "<password>" | md5
+#
+# Linux:
+#   $ echo -n "<password>" | md5sum
+#
diff --git a/sdc/data/sdc.properties b/sdc/data/sdc.properties
new file mode 100644
index 0000000..a07ff2d
--- /dev/null
+++ b/sdc/data/sdc.properties
@@ -0,0 +1,455 @@
+#
+# Copyright 2017 StreamSets Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# HTTP configuration
+
+# The base URL of the datacollector, used to create email alert messages.
+# If not set http://<hostname>:<http.port> is used
+# <hostname> is either taken from http.bindHost or resolved using
+# 'hostname -f' if not configured.
+#sdc.base.http.url=http://<hostname>:<port>
+
+# Hostname or IP address that data collector will bind to.
+# Default is 0.0.0.0 that will bind to all interfaces.
+#http.bindHost=0.0.0.0
+
+# Maximum number of HTTP servicing threads.
+#http.maxThreads=200
+
+# The port on which the data collector runs the SDC HTTP endpoint.
+# If different than -1, the SDC will run on this port
+# If 0, the SDC will pick up a random port
+# If the https.port is different than -1 or 0 and http.port is different than -1 or 0, the HTTP endpoint
+# will redirect to the HTTPS endpoint.
+http.port=18630
+
+# HTTPS configuration
+
+# The port on which the data collector runs the SDC HTTPS endpoint.
+# If different than -1, the SDC will run over SSL on this port
+# If 0, the SDC will pick up a random port
+https.port=-1
+
+# Enables HTTP/2 support for the SDC UI/REST API. If you are using any clients
+# that do not support ALPN for protocol negotiation, leave this option disabled.
+http2.enable=false
+
+# Reverse Proxy / Load Balancer configuration
+
+# SDC will handle X-Forwarded-For, X-Forwarded-Proto, X-Forwarded-Port
+# headers issued by a reverse proxy such as HAProxy, ELB, nginx when set to true.
+# Set to true when hosting SDC behind a reverse proxy / load balancer.
+http.enable.forwarded.requests=false
+
+# Java keystore file, in the SDC 'etc/' configuration directory
+https.keystore.path=keystore.jks
+
+# Password for the keystore file,
+# By default, the password is loaded from the 'keystore-password.txt'
+# from the SDC 'etc/' configuration directory
+https.keystore.password=${file("keystore-password.txt")}
+
+# Path to keystore file on worker node. This should always be an absolute location
+https.cluster.keystore.path=/opt/security/jks/sdc-keystore.jks
+
+# Password for keystore file on worker
+https.cluster.keystore.password=${file("/opt/security/jks/keystore-password.txt")}
+
+# Truststore configs
+# By default, if below configs are commented then cacerts from JRE lib directory will be used as truststore
+
+# Java truststore file on gateway sdc which stores certificates to trust identity of workers
+#https.truststore.path=
+
+# Password for truststore file
+#https.truststore.password=
+
+# Path to truststore file on worker node. This should always be an absolute location
+#https.cluster.truststore.path=/opt/security/jks/sdc-truststore.jks
+
+# Password for truststore file on worker
+#https.cluster.truststore.password=${file("/opt/security/jks/truststore-password.txt")}
+
+# HTTP Session Timeout
+# Max period of inactivity, after which the HTTP session is invalidated, in seconds.
+# Default value is 86400 seconds (24 hours)
+# value -1 means no timeout
+http.session.max.inactive.interval=86400
+
+# The authentication for the HTTP endpoint of the data collector
+# Valid values are: 'none', 'basic', 'digest', or 'form'
+#
+http.authentication=form
+
+# Authentication Login Module
+# Valid values are: 'file' and 'ldap'
+# For 'file', the authentication and role information is read from a property file (etc/basic-realm.properties,
+# etc/digest-realm.properties or etc/form-realm.properties based on the 'http.authentication' value).
+# For 'ldap', the authentication and role information is read from an LDAP server
+# and LDAP connection information is read from etc/ldap-login.conf.
+http.authentication.login.module=file
+
+# The realm used for authentication
+# A file with the realm name and '.properties' extension must exist in the data collector configuration directory
+# If this property is not set, the realm name is '<http.authentication>-realm'
+#http.digest.realm=local-realm
+
+# Check that the permissions of the realm file are owner-only
+http.realm.file.permission.check=true
+
+# LDAP group to Data Collector role mapping
+# the mapping is specified as the following pattern:
+# <ldap-group>:<sdc-role>(,<sdc-role>)*(;<ldap-group>:<sdc-role>(,<sdc-role>)*)*
+# e.g.
+# Administrator:admin;Manager:manager;DevOP:creator;Tester:guest;
+http.authentication.ldap.role.mapping=
+
+# LDAP login module name as present in the JAAS config file.
+# If no value is specified, the login module name is assumed to be "ldap"
+ldap.login.module.name=ldap
+
+# HTTP access control (CORS)
+http.access.control.allow.origin=*
+http.access.control.allow.headers=origin, content-type, cache-control, pragma, accept, authorization, x-requested-by, x-ss-user-auth-token, x-ss-rest-call
+http.access.control.allow.methods=GET, POST, PUT, DELETE, OPTIONS, HEAD
+
+# Runs the data collector within a Kerberos session which is propagated to all stages.
+# This is useful for stages that require Kerberos authentication with the services they interact with
+kerberos.client.enabled=false
+
+# The kerberos principal to use for the Kerberos session.
+# It should be a service principal. If the hostname part of the service principal is '_HOST' or '0.0.0.0',
+# the hostname will be replaced with the actual complete hostname of the data collector as advertised by the
+# unix command 'hostname -f'
+kerberos.client.principal=sdc/_HOST@EXAMPLE.COM
+
+# The location of the keytab file for the specified principal. If the path is relative, the keytab file will be
+# looked under the data collector configuration directory
+kerberos.client.keytab=sdc.keytab
+
+preview.maxBatchSize=10
+preview.maxBatches=10
+
+production.maxBatchSize=1000
+
+#Specifies the buffer size for Overrun parsers - including JSON, XML and CSV.
+#This parameter is specified in bytes, and must be greater than
+#1048576 bytes (which is the default size).
+#parser.limit=5335040
+
+#This option determines the number of error records, per stage, that will be retained in memory when the pipeline is
+#running. If set to zero, error records will not be retained in memory.
+#If the specified limit is reached the oldest records will be discarded to make room for the newest one.
+production.maxErrorRecordsPerStage=100
+
+#This option determines the number of pipeline errors that will be retained in memory when the pipeline is
+#running. If set to zero, pipeline errors will not be retained in memory.
+#If the specified limit is reached the oldest error will be discarded to make room for the newest one.
+production.maxPipelineErrors=100
+
+# Max number of concurrent REST calls allowed for the /rest/v1/admin/log endpoint
+max.logtail.concurrent.requests=5
+
+# Max number of concurrent WebSocket calls allowed
+max.webSockets.concurrent.requests=15
+
+# Pipeline Sharing / ACLs
+pipeline.access.control.enabled=false
+
+# Customize header title for SDC UI
+# You can pass any HTML tags here
+# Example:
+# For Text - <span class="navbar-brand">New Brand Name</span>
+# For Image - <img src="<image-url>">
+ui.header.title=
+
+ui.local.help.base.url=/docs
+ui.hosted.help.base.url=https://www.streamsets.com/documentation/datacollector/3.8.0-SNAPSHOT/userguide/help
+
+ui.refresh.interval.ms=2000
+ui.jvmMetrics.refresh.interval.ms=4000
+
+# If true SDC UI will use WebSocket to fetch pipeline status/metrics/alerts otherwise UI will poll every few seconds
+# to get the Pipeline status/metrics/alerts.
+ui.enable.webSocket=true
+
+# Number of changes supported by undo/redo functionality.
+# UI archives Pipeline Configuration/Rules in browser memory to support undo/redo functionality.
+ui.undo.limit=10
+
+# SMTP configuration to send alert emails
+# All properties starting with 'mail.' are used to create the JavaMail session, supported protocols are 'smtp' & 'smtps'
+mail.transport.protocol=smtp
+mail.smtp.host=localhost
+mail.smtp.port=25
+mail.smtp.auth=false
+mail.smtp.starttls.enable=false
+mail.smtps.host=localhost
+mail.smtps.port=465
+mail.smtps.auth=false
+# If 'mail.smtp.auth' or 'mail.smtps.auth' are set to true, these properties are used for the user/password credentials,
+# ${file("email-password.txt")} will load the value from the 'email-password.txt' file in the config directory (where this file is)
+xmail.username=foo
+xmail.password=${file("email-password.txt")}
+# FROM email address to use for the messages
+xmail.from.address=sdc@localhost
+
+#Indicates the location where runtime configuration properties can be found.
+#Value 'embedded' implies that the runtime configuration properties are present in this file and are prefixed with
+#'runtime.conf_'.
+#A value other than 'embedded' is treated as the name of a properties file from which the runtime configuration
+#properties must be picked up. Note that the properties should not be prefixed with 'runtime.conf_' in this case.
+runtime.conf.location=embedded
+
+# Java Security properties
+#
+# Any configuration prefixed with 'java.security.' will be set on the static instance java.security.Security
+# as part of SDC bootstrap process. This will change JVM configuration and should not be used when embedding and running
+# multiple SDC instances inside the same JVM.
+#
+# We're explicitly overriding this to zero as JVM will default to -1 if security manager is active.
+java.security.networkaddress.cache.ttl=0
+
+# Security Manager
+#
+# By default when Security Manager is enabled, SDC will use the Java one that only follows the specified policy.
+
+# Enable SDC Security Manager that will always prevent access to SDC's internal directories to all stages (e.g. data dir,
+# ...). Please note that there are certain JVM bugs that this manager might hit, especially on some older JVM versions.
+#security_manager.sdc_manager.enable=true
+
+# When Security manager is enabled SDC will by default prohibit access to its internal directories regardless of what
+# the security policy specifies. The following properties allow specific access to given files inside protected directories.
+
+# General exceptions - use with caution, all stage libraries will be able to access those files.
+# * Access to ldap-login.conf is sadly required by Hadoop's UserGroupInfo class
+security_manager.sdc_dirs.exceptions=$SDC_CONF/ldap-login.conf
+
+# Exceptions for specific stage libraries
+# * Our documentation recommends default name for credential store inside ETC directory
+security_manager.sdc_dirs.exceptions.lib.streamsets-datacollector-jks-credentialstore-lib=$SDC_CONF/jks-credentialStore.pkcs12
+
+
+# Stage specific configuration(s)
+#
+# The following config properties are for particular stages, please refer to their documentation for further details.
+#
+# Hadoop components
+# Uncomment to enforce Hadoop components in SDC to always impersonate current user rather than use the impersonation
+# configuration option. Current user is a user who either started the pipeline or ran preview.
+#stage.conf_hadoop.always.impersonate.current.user=true
+# Uncomment to enforce impersonated user name to be lower cased.
+#stage.conf_hadoop.always.lowercase.user=true
+#
+# Shell executor
+# Controls impersonation mode
+#stage.conf_com.streamsets.pipeline.stage.executor.shell.impersonation_mode=CURRENT_USER
+# Relative or absolute path to shell that should be used to execute the shell script
+#stage.conf_com.streamsets.pipeline.stage.executor.shell.shell=sh
+# Relative or absolute path to sudo command
+#stage.conf_com.streamsets.pipeline.stage.executor.shell.sudo=sudo
+
+#Observer related
+
+#The size of the queue where the pipeline queues up data rule evaluation requests.
+#Each request is for a stream and contains sampled records for all rules that apply to that lane.
+observer.queue.size=100
+
+#Sampled records which pass evaluation are cached for user to view. This determines the size of the cache and there is
+#one cache per data rule
+observer.sampled.records.cache.size=100
+
+#The time to wait before dropping a data rule evaluation request if the observer queue is full.
+observer.queue.offer.max.wait.time.ms=1000
+
+
+#Maximum number of private classloaders to allow in the data collector.
+#Stages that have configuration singletons (i.e. Hadoop FS & HBase) require private classloaders
+max.stage.private.classloaders=50
+
+# Pipeline runner pool
+# Default value is sufficient to run 22 pipelines. One pipeline requires 5 Threads and pipelines share
+# threads using thread pool. Approximate runner thread pool size = (Number of Running Pipelines) * 2.2.
+# Increasing this value will not increase parallelisation of individual pipelines.
+runner.thread.pool.size=50
+
+# Uncomment to disable starting all previously running pipelines on SDC start up
+#runner.boot.pipeline.restart=false
+
+# Maximum number of runners (multithreaded pipelines)
+#
+# Maximum number of source-less pipeline instances (=runners) that are allowed for a single multi-threaded
+# pipeline. The default is 50.
+pipeline.max.runners.count=50
+
+# Uncomment to specify a custom location for the package manager repositories.
+# Enter a URL or comma-separated list of URLs.
+# Official Data Collector releases use the following repositories by default:
+# http://archives.streamsets.com/datacollector/<version>/tarball/,http://archives.streamsets.com/datacollector/<version>/tarball/enterprise/
+# Data Collector source code builds (master branch) use the following repositories by default:
+# http://nightly.streamsets.com/datacollector/latest/tarball/,http://nightly.streamsets.com/datacollector/latest/tarball/enterprise/
+#package.manager.repository.links=
+
+# Support bundles
+#
+# Uncomment if you need to disable the facility for automatic support bundle upload.
+#bundle.upload.enabled=false
+#
+# Uncomment to automatically generate and upload bundle on various errors. Enable with caution, uploading bundle
+# can be a time-consuming task (depending on size and internet speed) and pipelines can appear "frozen" during
+# the upload especially when many pipelines are failing at the same time.
+#bundle.upload.on_error=true
+
+# Library aliases mapping to keep backward compatibility on pipelines when library names change
+# The current aliasing mapping is to handle 1.0.0beta2 to 1.0.0 library names changes
+#
+# IMPORTANT: Under normal circumstances all these properties should not be changed
+#
+library.alias.streamsets-datacollector-apache-kafka_0_8_1_1-lib=streamsets-datacollector-apache-kafka_0_8_1-lib
+library.alias.streamsets-datacollector-apache-kafka_0_8_2_0-lib=streamsets-datacollector-apache-kafka_0_8_2-lib
+library.alias.streamsets-datacollector-apache-kafka_0_8_2_1-lib=streamsets-datacollector-apache-kafka_0_8_2-lib
+library.alias.streamsets-datacollector-cassandra_2_1_5-lib=streamsets-datacollector-cassandra_2-lib
+library.alias.streamsets-datacollector-cdh5_2_1-lib=streamsets-datacollector-cdh_5_2-lib
+library.alias.streamsets-datacollector-cdh5_2_3-lib=streamsets-datacollector-cdh_5_2-lib
+library.alias.streamsets-datacollector-cdh5_2_4-lib=streamsets-datacollector-cdh_5_2-lib
+library.alias.streamsets-datacollector-cdh5_3_0-lib=streamsets-datacollector-cdh_5_3-lib
+library.alias.streamsets-datacollector-cdh5_3_1-lib=streamsets-datacollector-cdh_5_3-lib
+library.alias.streamsets-datacollector-cdh5_3_2-lib=streamsets-datacollector-cdh_5_3-lib
+library.alias.streamsets-datacollector-cdh5_4_0-cluster-cdh_kafka_1_2_0-lib=streamsets-datacollector-cdh_5_4-cluster-cdh_kafka_1_2-lib
+library.alias.streamsets-datacollector-cdh5_4_0-lib=streamsets-datacollector-cdh_5_4-lib
+library.alias.streamsets-datacollector-cdh5_4_1-cluster-cdh_kafka_1_2_0-lib=streamsets-datacollector-cdh_5_4-cluster-cdh_kafka_1_2-lib
+library.alias.streamsets-datacollector-cdh5_4_1-lib=streamsets-datacollector-cdh_5_4-lib
+library.alias.streamsets-datacollector-cdh_5_4-cluster-cdh_kafka_1_2_0-lib=streamsets-datacollector-cdh_5_4-cluster-cdh_kafka_1_2-lib
+library.alias.streamsets-datacollector-cdh_kafka_1_2_0-lib=streamsets-datacollector-cdh_kafka_1_2-lib
+library.alias.streamsets-datacollector-elasticsearch_1_4_4-lib=streamsets-datacollector-elasticsearch_1_4-lib
+library.alias.streamsets-datacollector-elasticsearch_1_5_0-lib=streamsets-datacollector-elasticsearch_1_5-lib
+library.alias.streamsets-datacollector-hdp_2_2_0-lib=streamsets-datacollector-hdp_2_2-lib
+library.alias.streamsets-datacollector-jython_2_7_0-lib=streamsets-datacollector-jython_2_7-lib
+library.alias.streamsets-datacollector-mongodb_3_0_2-lib=streamsets-datacollector-mongodb_3-lib
+library.alias.streamsets-datacollector-cassandra_2-lib=streamsets-datacollector-cassandra_3-lib
+library.alias.streamsets-datacollector-cdh_5_9-cluster-cdh_kafka_2_0-lib=streamsets-datacollector-cdh-spark_2_1-lib +library.alias.streamsets-datacollector-cdh_5_10-cluster-cdh_kafka_2_1-lib=streamsets-datacollector-cdh-spark_2_1-lib +library.alias.streamsets-datacollector-cdh_5_11-cluster-cdh_kafka_2_1-lib=streamsets-datacollector-cdh-spark_2_1-lib +library.alias.streamsets-datacollector-cdh_5_12-cluster-cdh_kafka_2_1-lib=streamsets-datacollector-cdh-spark_2_1-lib +library.alias.streamsets-datacollector-cdh_5_13-cluster-cdh_kafka_2_1-lib=streamsets-datacollector-cdh-spark_2_1-lib +library.alias.streamsets-datacollector-cdh_5_14-cluster-cdh_kafka_2_1-lib=streamsets-datacollector-cdh-spark_2_1-lib + + +# Stage aliases for mapping to keep backward compatibility on pipelines when stages move libraries +# The current alias mapping is to handle moving the jdbc stages to their own library +# +# IMPORTANT: Under normal circumstances all these properties should not be changed +# +stage.alias.streamsets-datacollector-basic-lib,com_streamsets_pipeline_stage_destination_jdbc_JdbcDTarget=streamsets-datacollector-jdbc-lib,com_streamsets_pipeline_stage_destination_jdbc_JdbcDTarget +stage.alias.streamsets-datacollector-basic-lib,com_streamsets_pipeline_stage_origin_jdbc_JdbcDSource=streamsets-datacollector-jdbc-lib,com_streamsets_pipeline_stage_origin_jdbc_JdbcDSource +stage.alias.streamsets-datacollector-basic-lib,com_streamsets_pipeline_stage_origin_omniture_OmnitureDSource=streamsets-datacollector-omniture-lib,com_streamsets_pipeline_stage_origin_omniture_OmnitureDSource +stage.alias.streamsets-datacollector-cdh_5_7-cluster-cdh_kafka_2_0-lib,com_streamsets_pipeline_stage_destination_kafka_KafkaDTarget=streamsets-datacollector-cdh_kafka_2_0-lib,com_streamsets_pipeline_stage_destination_kafka_KafkaDTarget +stage.alias.streamsets-datacollector-elasticsearch_1_4-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ElasticSearchDTarget=streamsets-datacollector-elasticsearch_5-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ElasticSearchDTarget +stage.alias.streamsets-datacollector-elasticsearch_1_5-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ElasticSearchDTarget=streamsets-datacollector-elasticsearch_5-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ElasticSearchDTarget +stage.alias.streamsets-datacollector-elasticsearch_1_6-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ElasticSearchDTarget=streamsets-datacollector-elasticsearch_5-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ElasticSearchDTarget +stage.alias.streamsets-datacollector-elasticsearch_1_7-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ElasticSearchDTarget=streamsets-datacollector-elasticsearch_5-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ElasticSearchDTarget +stage.alias.streamsets-datacollector-elasticsearch_2_0-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ElasticSearchDTarget=streamsets-datacollector-elasticsearch_5-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ElasticSearchDTarget +stage.alias.streamsets-datacollector-elasticsearch_2_1-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ElasticSearchDTarget=streamsets-datacollector-elasticsearch_5-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ElasticSearchDTarget 
+stage.alias.streamsets-datacollector-elasticsearch_2_2-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ElasticSearchDTarget=streamsets-datacollector-elasticsearch_5-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ElasticSearchDTarget +stage.alias.streamsets-datacollector-elasticsearch_2_3-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ElasticSearchDTarget=streamsets-datacollector-elasticsearch_5-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ElasticSearchDTarget +stage.alias.streamsets-datacollector-elasticsearch_2_4-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ElasticSearchDTarget=streamsets-datacollector-elasticsearch_5-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ElasticSearchDTarget +stage.alias.streamsets-datacollector-elasticsearch_5_0-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ElasticSearchDTarget=streamsets-datacollector-elasticsearch_5-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ElasticSearchDTarget +stage.alias.streamsets-datacollector-elasticsearch_1_4-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ToErrorElasticSearchDTarget=streamsets-datacollector-elasticsearch_5-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ToErrorElasticSearchDTarget +stage.alias.streamsets-datacollector-elasticsearch_1_5-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ToErrorElasticSearchDTarget=streamsets-datacollector-elasticsearch_5-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ToErrorElasticSearchDTarget +stage.alias.streamsets-datacollector-elasticsearch_1_6-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ToErrorElasticSearchDTarget=streamsets-datacollector-elasticsearch_5-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ToErrorElasticSearchDTarget +stage.alias.streamsets-datacollector-elasticsearch_1_7-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ToErrorElasticSearchDTarget=streamsets-datacollector-elasticsearch_5-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ToErrorElasticSearchDTarget +stage.alias.streamsets-datacollector-elasticsearch_2_0-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ToErrorElasticSearchDTarget=streamsets-datacollector-elasticsearch_5-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ToErrorElasticSearchDTarget +stage.alias.streamsets-datacollector-elasticsearch_2_1-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ToErrorElasticSearchDTarget=streamsets-datacollector-elasticsearch_5-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ToErrorElasticSearchDTarget +stage.alias.streamsets-datacollector-elasticsearch_2_2-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ToErrorElasticSearchDTarget=streamsets-datacollector-elasticsearch_5-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ToErrorElasticSearchDTarget +stage.alias.streamsets-datacollector-elasticsearch_2_3-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ToErrorElasticSearchDTarget=streamsets-datacollector-elasticsearch_5-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ToErrorElasticSearchDTarget +stage.alias.streamsets-datacollector-elasticsearch_2_4-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ToErrorElasticSearchDTarget=streamsets-datacollector-elasticsearch_5-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ToErrorElasticSearchDTarget 
+stage.alias.streamsets-datacollector-elasticsearch_5_0-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ToErrorElasticSearchDTarget=streamsets-datacollector-elasticsearch_5-lib,com_streamsets_pipeline_stage_destination_elasticsearch_ToErrorElasticSearchDTarget +stage.alias.streamsets-datacollector-cdh_5_4-lib,com_streamsets_pipeline_stage_processor_spark_StandaloneSparkDProcessor=streamsets-datacollector-cdh_spark_2_1_r1-lib,com_streamsets_pipeline_stage_processor_spark_SparkDProcessor +stage.alias.streamsets-datacollector-cdh_5_5-lib,com_streamsets_pipeline_stage_processor_spark_StandaloneSparkDProcessor=streamsets-datacollector-cdh_spark_2_1_r1-lib,com_streamsets_pipeline_stage_processor_spark_SparkDProcessor +stage.alias.streamsets-datacollector-cdh_5_7-lib,com_streamsets_pipeline_stage_processor_spark_StandaloneSparkDProcessor=streamsets-datacollector-cdh_spark_2_1_r1-lib,com_streamsets_pipeline_stage_processor_spark_SparkDProcessor +stage.alias.streamsets-datacollector-cdh_5_8-lib,com_streamsets_pipeline_stage_processor_spark_StandaloneSparkDProcessor=streamsets-datacollector-cdh_spark_2_1_r1-lib,com_streamsets_pipeline_stage_processor_spark_SparkDProcessor +stage.alias.streamsets-datacollector-cdh_5_9-lib,com_streamsets_pipeline_stage_processor_spark_StandaloneSparkDProcessor=streamsets-datacollector-cdh_spark_2_1_r1-lib,com_streamsets_pipeline_stage_processor_spark_SparkDProcessor +stage.alias.streamsets-datacollector-cdh_5_10-lib,com_streamsets_pipeline_stage_processor_spark_StandaloneSparkDProcessor=streamsets-datacollector-cdh_spark_2_1_r1-lib,com_streamsets_pipeline_stage_processor_spark_SparkDProcessor +stage.alias.streamsets-datacollector-aws-lib,com_streamsets_pipeline_stage_destination_kinesis_FirehoseDTarget=streamsets-datacollector-kinesis-lib,com_streamsets_pipeline_stage_destination_kinesis_FirehoseDTarget +stage.alias.streamsets-datacollector-aws-lib,com_streamsets_pipeline_stage_destination_kinesis_StatsKinesisDTarget=streamsets-datacollector-kinesis-lib,com_streamsets_pipeline_stage_destination_kinesis_StatsKinesisDTarget +stage.alias.streamsets-datacollector-aws-lib,com_streamsets_pipeline_stage_destination_kinesis_KinesisDTarget=streamsets-datacollector-kinesis-lib,com_streamsets_pipeline_stage_destination_kinesis_KinesisDTarget +stage.alias.streamsets-datacollector-aws-lib,com_streamsets_pipeline_stage_destination_kinesis_ToErrorKinesisDTarget=streamsets-datacollector-kinesis-lib,com_streamsets_pipeline_stage_destination_kinesis_ToErrorKinesisDTarget +stage.alias.streamsets-datacollector-aws-lib,com_streamsets_pipeline_stage_origin_kinesis_KinesisDSource=streamsets-datacollector-kinesis-lib,com_streamsets_pipeline_stage_origin_kinesis_KinesisDSource +stage.alias.streamsets-datacollector-hdp_2_3-lib,com_streamsets_pipeline_stage_processor_hive_HiveMetadataDProcessor=streamsets-datacollector-hdp_2_3-hive1-lib,com_streamsets_pipeline_stage_processor_hive_HiveMetadataDProcessor +stage.alias.streamsets-datacollector-hdp_2_3-lib,com_streamsets_pipeline_stage_destination_hive_HiveMetastoreDTarget=streamsets-datacollector-hdp_2_3-hive1-lib,com_streamsets_pipeline_stage_destination_hive_HiveMetastoreDTarget +stage.alias.streamsets-datacollector-hdp_2_3-lib,com_streamsets_pipeline_stage_destination_hive_HiveDTarget=streamsets-datacollector-hdp_2_3-hive1-lib,com_streamsets_pipeline_stage_destination_hive_HiveDTarget 
+stage.alias.streamsets-datacollector-hdp_2_4-lib,com_streamsets_pipeline_stage_processor_hive_HiveMetadataDProcessor=streamsets-datacollector-hdp_2_4-hive1-lib,com_streamsets_pipeline_stage_processor_hive_HiveMetadataDProcessor +stage.alias.streamsets-datacollector-hdp_2_4-lib,com_streamsets_pipeline_stage_destination_hive_HiveMetastoreDTarget=streamsets-datacollector-hdp_2_4-hive1-lib,com_streamsets_pipeline_stage_destination_hive_HiveMetastoreDTarget +stage.alias.streamsets-datacollector-hdp_2_4-lib,com_streamsets_pipeline_stage_destination_hive_HiveDTarget=streamsets-datacollector-hdp_2_4-hive1-lib,com_streamsets_pipeline_stage_destination_hive_HiveDTarget + + +# System and user stage libraries whitelists and blacklists +# +# If commented out all stagelibraries directories are used. +# +# Given 'system' or 'user', only whitelist or blacklist can be set, if both are set the Data Collector will fail to start +# +# Specify stage library directories separated by commas +# +# The MapR stage libraries are disabled as they require manual installation step. Use setup-mapr script to enable +# the desired MapR stage library. +# +# It's important to keep the blacklist and whitelist properties on a single line, otherwise CSD's control.sh script and +# setup-mapr script will not work properly. +# +#system.stagelibs.whitelist= +system.stagelibs.blacklist=streamsets-datacollector-mapr_5_0-lib,streamsets-datacollector-mapr_5_1-lib,streamsets-datacollector-mapr_5_2-lib,streamsets-datacollector-mapr_6_0-lib,streamsets-datacollector-mapr_6_0-mep4-lib,streamsets-datacollector-mapr_6_0-mep5-lib,streamsets-datacollector-mapr_6_1-lib,streamsets-datacollector-mapr_6_1-mep6-lib,streamsets-datacollector-mapr_spark_2_1_mep_3_0-lib +# +#user.stagelibs.whitelist= +#user.stagelibs.blacklist= + +# Stage Classpath Validation +# +# Uncomment to disable best effort validation of each stage library classpath to detect known issues with +# colliding dependencies (such as conflicting versions of the same dependency, ...). Result of the validation +# is by default only printed to log. +#stagelibs.classpath.validation.enable=false +# +# By default the validation result is only logged. Uncomment to prevent SDC to start if classpath of any +# stage library is not considered valid. +#stagelibs.classpath.validation.terminate=true + +# +# Additional Configuration files to include in to the configuration. +# Value of this property is the name of the configuration file separated by commas. +# +config.includes=dpm.properties,vault.properties,credential-stores.properties + + +# +# Record Sampling configurations indicate the size of the subset (sample set) that must be chosen from a population (of records). +# Default configuration values indicate the sampler to select 1 out of 10000 records +# +# For better performance simplify the fraction ( sdc.record.sampling.sample.size / sdc.record.sampling.population.size ) +# i.e., specify ( 1 / 40 ) instead of ( 250 / 10000 ). +sdc.record.sampling.sample.size=1 +sdc.record.sampling.population.size=10000 + +# +# Pipeline State are cached for faster access. +# Specifies the maximum number of pipeline state entries the cache may contain. +store.pipeline.state.cache.maximum.size=100 + +# Specifies that each pipeline state entry should be automatically removed from the cache once a fixed duration +# has elapsed after the entry's creation, the most recent replacement of its value, or its last access. 
+# In minutes +store.pipeline.state.cache.expire.after.access=10 diff --git a/sdc/docker-compose.yml b/sdc/docker-compose.yml new file mode 100644 index 0000000..232155b --- /dev/null +++ b/sdc/docker-compose.yml @@ -0,0 +1,8 @@ +sdc: + image: streamsets/datacollector + ports: + - "18630:18630" + volumes: + - ./data:/data + - ./data/sdc.properties:/etc/sdc/sdc.properties + restart: always
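
For reference, here is a minimal sketch of adding another local user once the stack is up. The `alice` user name, `s3cr3t` password, and `group:dev` membership are illustrative only; the MD5 command and the entry format come from the comments in `sdc/data/form-realm.properties` above (each shipped account's password equals its user name).

```bash
# Hash the new user's password (Linux; on OSX use `md5` as noted in form-realm.properties)
$ echo -n "s3cr3t" | md5sum
# Append a matching entry to data/form-realm.properties, for example:
#   alice: MD5:<md5-from-above>,user,creator,group:dev
# Restart so the container picks up the change, then confirm the web UI answers on the published port
$ docker-compose restart
$ curl -sI http://localhost:18630/ | head -n 1
```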