diff --git a/README.md b/README.md index 1406cdd..543423d 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ Overview of all OpenML components including a docker-compose to run OpenML servi ![OpenML Component overview](https://raw.githubusercontent.com/openml/services/main/documentation/OpenML-overview.png) ## Prerequisites -- Linux/MacOS with Intell processor (because of our old ES version, this project currently does not support `arm` architectures) +- Linux/MacOS (For Mac with `arm` architectures, enable Rosetta for emulation. QEMU and Docker VMM do not work with the elastic search image) - [Docker](https://docs.docker.com/get-docker/) - [Docker Compose](https://docs.docker.com/compose/install/) version 2.21.0 or higher diff --git a/config/arff-to-pq-converter/Dockerfile b/config/arff-to-pq-converter/Dockerfile index d164e90..58cc6b7 100644 --- a/config/arff-to-pq-converter/Dockerfile +++ b/config/arff-to-pq-converter/Dockerfile @@ -8,4 +8,4 @@ RUN apt update && apt upgrade -y RUN apt -y install cron RUN chmod +x /etc/cron.d/openml -RUN crontab -u unprivileged-user /etc/cron.d/openml \ No newline at end of file +RUN crontab -u unprivileged-user /etc/cron.d/openml diff --git a/config/arff-to-pq-converter/config b/config/arff-to-pq-converter/config index 9e32e2f..acbc67d 100644 --- a/config/arff-to-pq-converter/config +++ b/config/arff-to-pq-converter/config @@ -1 +1 @@ -server=http://nginx:80/api/v1/xml +server=http://nginx:8000/api/v1/xml diff --git a/config/database/update.sh b/config/database/update.sh index 0239211..8ef5a75 100755 --- a/config/database/update.sh +++ b/config/database/update.sh @@ -2,30 +2,7 @@ # Change the filepath of openml.file # from "https://www.openml.org/data/download/1666876/phpFsFYVN" # to "http://minio:9000/datasets/0000/0001/phpFsFYVN" -mysql -hdatabase -uroot -pok -e 'UPDATE openml.file SET filepath = CONCAT("http://minio:9000/datasets/0000/", LPAD(id, 4, "0"), "/", SUBSTRING_INDEX(filepath, "/", -1)) WHERE extension="arff";' 
+mysql -hdatabase -uroot -pok -e 'UPDATE openml.file SET filepath = CONCAT("http://localhost:8000/minio/datasets/0000/", LPAD(id, 4, "0"), "/", SUBSTRING_INDEX(filepath, "/", -1)) WHERE extension="arff";' # Update openml.expdb.dataset with the same url mysql -hdatabase -uroot -pok -e 'UPDATE openml_expdb.dataset DS, openml.file FL SET DS.url = FL.filepath WHERE DS.did = FL.id;' - - - - - -# Create the data_feature_description TABLE. TODO: can we make sure this table exists already? -mysql -hdatabase -uroot -pok -Dopenml_expdb -e 'CREATE TABLE IF NOT EXISTS `data_feature_description` ( - `did` int unsigned NOT NULL, - `index` int unsigned NOT NULL, - `uploader` mediumint unsigned NOT NULL, - `date` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - `description_type` enum("plain", "ontology") NOT NULL, - `value` varchar(256) NOT NULL, - KEY `did` (`did`,`index`), - CONSTRAINT `data_feature_description_ibfk_1` FOREIGN KEY (`did`, `index`) REFERENCES `data_feature` (`did`, `index`) ON DELETE CASCADE ON UPDATE CASCADE -)' - -# SET dataset 1 to active (used in unittests java) -mysql -hdatabase -uroot -pok -Dopenml_expdb -e 'INSERT IGNORE INTO dataset_status VALUES (1, "active", "2024-01-01 00:00:00", 1)' -mysql -hdatabase -uroot -pok -Dopenml_expdb -e 'DELETE FROM dataset_status WHERE did = 2 AND status = "deactivated";' - -# Temporary fix in case the database missed the kaggle table. The PHP Rest API expects the table to be there, while indexing. 
-mysql -hdatabase -uroot -pok -Dopenml_expdb -e 'CREATE TABLE IF NOT EXISTS `kaggle` (`dataset_id` int(11) DEFAULT NULL, `kaggle_link` varchar(500) DEFAULT NULL)' \ No newline at end of file diff --git a/config/email-server/imapsql.db b/config/email-server/imapsql.db index 95f5d19..de1f397 100644 Binary files a/config/email-server/imapsql.db and b/config/email-server/imapsql.db differ diff --git a/config/evaluation-engine/.env b/config/evaluation-engine/.env index 882ba22..17480f2 100644 --- a/config/evaluation-engine/.env +++ b/config/evaluation-engine/.env @@ -1,4 +1,4 @@ -CONFIG=api_key=AD000000000000000000000000000000;server=http://php-api:80/ +CONFIG=api_key=abc;server=http://php-api:80/ JAVA=/usr/bin/java JAR=/usr/local/lib/evaluation-engine.jar -LOG_DIR=/logs \ No newline at end of file +LOG_DIR=/logs diff --git a/config/evaluation-engine/run-cron.sh b/config/evaluation-engine/run-cron.sh index cff7a2f..5c1b03d 100755 --- a/config/evaluation-engine/run-cron.sh +++ b/config/evaluation-engine/run-cron.sh @@ -1,6 +1,16 @@ #!/bin/sh +# We need to remove the default 127.0.0.1 localhost map to +# ensure the remap to the static nginx ip address is respected. +# Updating /etc/hosts in place isn't always allowed ("Resource Busy"), +# directly overwriting it instead seems to bypass that protection. 
+cp /etc/hosts /etc/hosts.new +sed -i '/^127.0.0.1.*localhost/d' /etc/hosts.new +sed -i -E 's/^(::1\t)localhost (.*)$/\1\2/g' /etc/hosts.new +cat /etc/hosts.new > /etc/hosts +rm /etc/hosts.new + printenv | grep -v HOME >> /etc/environment touch /cron.log -/usr/sbin/crond -l 4 && tail -f /cron.log \ No newline at end of file +/usr/sbin/crond -l 4 && tail -f /cron.log diff --git a/config/frontend/.env b/config/frontend/.env index 128bbab..b33aeba 100644 --- a/config/frontend/.env +++ b/config/frontend/.env @@ -9,7 +9,7 @@ DATABASE_URI="mysql+pymysql://root:ok@database:3306/openml" TESTING=False REACT_APP_URL_SITE_BACKEND=http://localhost:8000/ -REACT_APP_URL_API=http://localhost:8000/api/ +REACT_APP_URL_API=http://localhost:8000/ REACT_APP_URL_ELASTICSEARCH=http://localhost:8000/es/ REACT_APP_ELASTICSEARCH_VERSION_MAYOR=6 REACT_APP_URL_MINIO=http://localhost:8000/data/ diff --git a/config/nginx/Dockerfile b/config/nginx/Dockerfile index 7498879..7a7ab50 100644 --- a/config/nginx/Dockerfile +++ b/config/nginx/Dockerfile @@ -3,6 +3,6 @@ FROM nginx:alpine WORKDIR /etc/nginx COPY ./nginx.conf ./conf.d/default.conf COPY ./shared.conf ./shared.conf -EXPOSE 80 +EXPOSE 8000 ENTRYPOINT [ "nginx" ] -CMD [ "-g", "daemon off;" ] \ No newline at end of file +CMD [ "-g", "daemon off;" ] diff --git a/config/nginx/nginx.conf b/config/nginx/nginx.conf index 5316b1f..7dcd3d7 100644 --- a/config/nginx/nginx.conf +++ b/config/nginx/nginx.conf @@ -3,7 +3,7 @@ server { - listen 80; + listen 8000; server_name localhost; resolver 127.0.0.11; @@ -48,4 +48,4 @@ server { set $upstream_f http://frontend:5000; proxy_pass $upstream_f/$1$is_args; } -} \ No newline at end of file +} diff --git a/config/php/.env b/config/php/.env index 3a641c8..77c1058 100644 --- a/config/php/.env +++ b/config/php/.env @@ -1,6 +1,6 @@ API_KEY=AD000000000000000000000000000000 -BASE_URL=http://php-api:80/ -MINIO_URL=http://minio:9000/ +BASE_URL=http://localhost:8000/ +MINIO_URL=http://localhost:8000/minio/ 
DB_HOST_OPENML=database:3306 DB_HOST_EXPDB=database:3306 DB_USER_OPENML=root @@ -10,4 +10,5 @@ DB_PASS_EXPDB_WRITE=ok DB_USER_EXPDB_READ=root DB_PASS_EXPDB_READ=ok ES_URL=elasticsearch:9200 -ES_PASSWORD=default \ No newline at end of file +ES_PASSWORD=default +INDEX_ES_DURING_STARTUP=true diff --git a/config/python/config b/config/python/config index a64d014..62266bf 100644 --- a/config/python/config +++ b/config/python/config @@ -1,2 +1,2 @@ -apikey=AD000000000000000000000000000000 -server=http://nginx:80/api/v1/xml \ No newline at end of file +apikey=normaluser +server=http://localhost:8000/api/v1/xml diff --git a/docker-compose.yaml b/docker-compose.yaml index e4e9507..d4c0831 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -1,6 +1,6 @@ services: database: - image: "openml/test-database:20240105" + image: "openml/test-database:v0.1.20260204" container_name: "openml-test-database" environment: MYSQL_ROOT_PASSWORD: ok @@ -54,11 +54,15 @@ services: context: config/nginx container_name: openml-nginx ports: - - "8000:80" + - "8000:8000" + networks: + default: + ipv4_address: 172.28.0.2 + php-api: profiles: ["all", "minio", "rest-api", "frontend", "evaluation-engine"] - image: openml/php-rest-api:v1.2.1 + image: openml/php-rest-api:v1.2.4 container_name: "openml-php-rest-api" ports: - "8080:80" # also known as /api (nginx) @@ -78,6 +82,8 @@ services: start_interval: 5s timeout: 3s interval: 1m + extra_hosts: + - "localhost=172.28.0.2" email-server: profiles: ["all", "frontend"] @@ -95,7 +101,7 @@ services: frontend: profiles: ["all", "frontend"] - image: openml/frontend:dev_v2.0.20251111 + image: openml/frontend:v2.1.1 container_name: "openml-frontend" ports: - "8081:5000" # also known as / (nginx) @@ -108,7 +114,7 @@ services: minio: profiles: ["all", "minio", "evaluation-engine"] - image: openml/test-minio:v0.1.20241110 + image: openml/test-minio:v0.1.20260204 container_name: "openml-minio" ports: - "9000:9000" # also known as /data (nginx) @@ -133,6 
+139,8 @@ services: depends_on: php-api: condition: service_healthy + extra_hosts: + - "localhost=172.28.0.2" croissants: profiles: ["all"] @@ -157,7 +165,15 @@ services: depends_on: php-api: condition: service_healthy + extra_hosts: + - "localhost=172.28.0.2" networks: default: name: openml-services + ipam: + driver: default + config: + - subnet: 172.28.0.0/16 + ip_range: 172.28.1.0/24 + diff --git a/test.sh b/test.sh new file mode 100755 index 0000000..8b2b1fc --- /dev/null +++ b/test.sh @@ -0,0 +1,135 @@ +#!/bin/bash +# This test assumes services are running locally: +# `docker compose --profile all up -d` +# +# It tests some of the most important services, but is by no means comprehensive. +# In particular, also at least check the frontpage in a browser (http://localhost:8000). + +set -e + +assert_contains() { + if echo "$1" | grep --ignore-case -q "$2"; then + echo "PASS: output contains '$2'" + else + echo "FAIL: output does not contain '$2'" + echo "Full output:" + echo "$1" + exit 1 + fi +} + +assert_url_exists() { + if curl --output /dev/null --silent --head --fail --location "$1"; then + echo "PASS: $1 exists" + else + echo "FAIL: $1 does not exist" + exit 1 + fi +} + +# nginx redirects request to the home page +HOME_PAGE=$(curl -s http://localhost:8000) +assert_contains "$HOME_PAGE" "OpenML is an open platform for sharing datasets" + +DATASET_URL=http://localhost:8000/minio/datasets/0000/0020/dataset_37_diabetes.arff +DESCRIPTION_URL=http://localhost:8000/api/v1/json/data/20 + +# The JSON response may contain escaped slashes (e.g. http:\/\/), so strip them +DESCRIPTION=$(curl -s "$DESCRIPTION_URL" | sed 's/\\//g') +assert_contains "$DESCRIPTION" "diabetes" + +wget "$DATASET_URL" -O dataset.arff +assert_contains "$(cat dataset.arff)" "@data" +rm dataset.arff + +if [ -d .venv ]; then + echo "Using existing virtual environment for dataset upload." +else + echo "Creating virtual environment for dataset upload." 
+    python -m venv .venv
+    source .venv/bin/activate
+    python -m pip install uv
+    uv pip install openml numpy
+fi
+
+echo "Attempting dataset upload"
+
+DATA_ID=$(.venv/bin/python -c "
+import numpy as np
+import openml
+from openml.datasets import create_dataset
+
+openml.config.server = 'http://localhost:8000/api/v1/xml'
+openml.config.apikey = 'normaluser'
+
+data = np.array([[1, 2, 3], [1.2, 2.5, 3.8], [2, 5, 8], [0, 1, 0]]).T
+attributes = [('col_' + str(i), 'REAL') for i in range(data.shape[1])]
+
+dataset = create_dataset(
+    name='test-data',
+    description='Synthetic dataset created from a NumPy array',
+    creator='OpenML tester',
+    contributor=None,
+    collection_date='01-01-2018',
+    language='English',
+    licence='MIT',
+    default_target_attribute='col_' + str(data.shape[1] - 1),
+    row_id_attribute=None,
+    ignore_attribute=None,
+    citation='None',
+    attributes=attributes,
+    data=data,
+    version_label='test',
+    original_data_url='http://openml.github.io/openml-python',
+    paper_url='http://openml.github.io/openml-python',
+)
+dataset.publish()
+print(dataset.id)
+")
+
+# Make sure DATA_ID is an integer, and not some Python error output
+if ! echo "$DATA_ID" | grep -q '^[0-9]\+$'; then
+    echo "FAIL: DATA_ID is not an integer: '$DATA_ID'"
+    exit 1
+fi
+
+NEW_DATASET_URL=$(curl -s "http://localhost:8000/api/v1/json/data/${DATA_ID}" | jq -r ".data_set_description.url")
+assert_url_exists "$NEW_DATASET_URL"
+wget "$NEW_DATASET_URL" -O new_dataset.arff
+assert_contains "$(cat new_dataset.arff)" "@data"
+rm new_dataset.arff
+
+# Wait for the dataset to become active, polling every 10 seconds for up to 2 minutes
+WAITED=0
+while [ "$WAITED" -lt 120 ]; do
+    DATASET_STATUS=$(curl -s "http://localhost:8000/api/v1/json/data/${DATA_ID}")
+    if echo "$DATASET_STATUS" | grep -q "active"; then
+        echo "PASS: dataset $DATA_ID is active (after ${WAITED}s)"
+        break
+    fi
+    echo "Waiting for dataset $DATA_ID to become active... 
(${WAITED}s elapsed)"
+    sleep 10
+    WAITED=$((WAITED + 10))
+done
+
+if [ "$WAITED" -ge 120 ]; then
+    echo "FAIL: dataset $DATA_ID did not become active within 120s"
+    echo "Full output:"
+    echo "$DATASET_STATUS"
+    exit 1
+fi
+
+echo "Checking parquet conversion"
+PADDED_ID=$(printf "%04d" "$DATA_ID")
+NEW_PARQUET_URL="http://localhost:8000/minio/datasets/0000/${PADDED_ID}/dataset_${DATA_ID}.pq"
+wget "$NEW_PARQUET_URL"
+DATA_SHAPE=$(.venv/bin/python -c "import pandas as pd; df = pd.read_parquet(\"dataset_${DATA_ID}.pq\"); print(df.shape)")
+assert_contains "${DATA_SHAPE}" "(3, 4)"
+rm "dataset_${DATA_ID}.pq"
+
+CROISSANT_URL="http://localhost:8000/croissant/dataset/${DATA_ID}"
+CROISSANT_NAME=$(curl -s "${CROISSANT_URL}" | jq -r ".name")
+assert_contains "${CROISSANT_NAME}" "test-data"
+
+ES_RESPONSE=$(curl -s "http://localhost:8000/es/data/_doc/${DATA_ID}")
+assert_contains "${ES_RESPONSE}" "test-data"