-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdocker-compose.yml
More file actions
92 lines (79 loc) · 1.92 KB
/
docker-compose.yml
File metadata and controls
92 lines (79 loc) · 1.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# Compose stack for DataFlowX: PostgreSQL, Kafka (with ZooKeeper), Redis,
# Spark (master + worker), Jupyter, and MinIO.
#
# Secrets below use ${VAR:-default} interpolation: export the variable (or put
# it in a .env file next to this file) to override; when it is unset the
# default shown after ":-" is used.
version: '3'

services:
  # Database for data storage
  postgres:
    image: postgres:13
    environment:
      POSTGRES_USER: dataflowx
      POSTGRES_PASSWORD: "${POSTGRES_PASSWORD:-dataflowx}"
      POSTGRES_DB: dataflowx
    ports:
      - "5432:5432"
    volumes:
      # Named volume so database files survive container re-creation.
      - postgres_data:/var/lib/postgresql/data

  # Kafka for data streaming (ZooKeeper is the coordination service the
  # broker connects to via KAFKA_ZOOKEEPER_CONNECT)
  zookeeper:
    image: wurstmeister/zookeeper
    ports:
      - "2181:2181"

  kafka:
    image: wurstmeister/kafka
    ports:
      - "9092:9092"
    environment:
      # NOTE(review): the broker advertises "localhost", so presumably only
      # clients running on the Docker host can use it; containers on the
      # compose network (e.g. spark, jupyter) likely need an advertised
      # listener that resolves to the "kafka" service - confirm.
      KAFKA_ADVERTISED_HOST_NAME: localhost
      KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
      # Presumably "<topic>:<partitions>:<replicas>" per entry - verify against
      # the image documentation.
      KAFKA_CREATE_TOPICS: "ecommerce-stream:1:1,processed-events:1:1"
      # NOTE(review): the next three look like client-side settings rather
      # than broker settings; confirm the broker image actually needs them.
      KAFKA_BOOTSTRAP_SERVERS: "localhost:9092"
      KAFKA_CONSUMER_GROUP: "dataflowx-group"
      KAFKA_AUTO_OFFSET_RESET: "earliest"
    depends_on:
      - zookeeper

  # Redis for caching
  redis:
    image: redis:6
    ports:
      - "6379:6379"

  # Spark for data processing
  spark-master:
    image: bitnami/spark:3
    environment:
      SPARK_MODE: master
    ports:
      - "8080:8080"
      - "7077:7077"

  spark-worker:
    image: bitnami/spark:3
    environment:
      SPARK_MODE: worker
      # Points the worker at the master service's published 7077 port.
      SPARK_MASTER_URL: spark://spark-master:7077
    depends_on:
      - spark-master

  # Jupyter for interactive analysis
  jupyter:
    image: jupyter/pyspark-notebook
    ports:
      - "8888:8888"
    volumes:
      # Host ./notebooks directory is mounted into the container.
      - ./notebooks:/home/jovyan/work
    environment:
      # Quoted so YAML keeps it a string instead of parsing it as a boolean.
      JUPYTER_ENABLE_LAB: "yes"

  # MinIO for S3-compatible storage
  minio:
    image: minio/minio
    ports:
      - "9000:9000"
      - "9001:9001"
    environment:
      MINIO_ROOT_USER: minio
      MINIO_ROOT_PASSWORD: "${MINIO_ROOT_PASSWORD:-minio123}"
    # Serves /data and puts the web console on 9001 (published above).
    command: server /data --console-address ":9001"
    volumes:
      - minio_data:/data

# Named volumes referenced by the postgres and minio services.
volumes:
  postgres_data: {}
  minio_data: {}
# DataFlowX
A streamlined data pipeline for ingesting, processing, streaming, and visualizing data.
## Architecture
DataFlowX implements a modern data pipeline architecture: