Skip to content

Update build and docker image to fetch models from huggingface #44

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 24 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 65 additions & 0 deletions .github/workflows/ci-build-manual.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Manually triggered workflow: verifies that the project builds with Gradle,
# then builds the image described by Dockerfile.software and pushes it to
# Docker Hub under `latest-develop` plus a caller-supplied tag.
name: Build and push a development version on docker

on:
  workflow_dispatch:
    inputs:
      custom_tag:
        type: string
        description: Docker image tag
        required: true
        default: "latest-develop"

jobs:
  # Gate job: compiles the project (tests excluded via `-x test`). No artifact
  # is uploaded; `docker-build` below performs its own checkout.
  build:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
      - name: Set up JDK 17
        uses: actions/setup-java@v4
        with:
          java-version: '17.0.10+7'
          distribution: 'temurin'
          cache: 'gradle'
      - name: Build with Gradle
        run: ./gradlew build -x test

  # Builds and pushes the Docker image once the Gradle build has succeeded.
  docker-build:
    needs: [ build ]
    runs-on: ubuntu-latest

    steps:
      # Remove preinstalled toolchains that this job does not use, to free
      # disk space on the runner before the image build.
      - name: Create more disk space
        run: |
          sudo rm -rf /usr/share/dotnet
          sudo rm -rf /opt/ghc
          sudo rm -rf "/usr/local/share/boost"
          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
          sudo rm -rf /opt/hostedtoolcache
          sudo rm -rf /opt/google/chrome
          sudo rm -rf /opt/microsoft/msedge
          sudo rm -rf /opt/microsoft/powershell
          sudo rm -rf /opt/pipx
          sudo rm -rf /usr/lib/mono
          sudo rm -rf /usr/local/julia*
          sudo rm -rf /usr/local/lib/android
          sudo rm -rf /usr/local/lib/node_modules
          sudo rm -rf /usr/local/share/chromium
          sudo rm -rf /usr/local/share/powershell
          sudo rm -rf /usr/share/swift
      - uses: actions/checkout@v4
      - name: Build and push
        id: docker_build
        uses: mr-smithers-excellent/docker-build-push@v6
        with:
          dockerfile: Dockerfile.software
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
          image: lfoppiano/software-mentions
          registry: docker.io
          pushImage: true
          # Comma-separated list: the fixed development tag plus the tag
          # entered when the workflow was dispatched.
          tags: |
            latest-develop, ${{ github.event.inputs.custom_tag }}
      # NOTE(review): docker-build-push documents the outputs `imageFullName`,
      # `imageName` and `tags`; it has no `digest` output, so the previous
      # `echo ${{ steps.docker_build.outputs.digest }}` printed an empty line.
      # Values go through `env` so they are never spliced into the script text.
      - name: Image name and tags
        env:
          IMAGE_FULL_NAME: ${{ steps.docker_build.outputs.imageFullName }}
          IMAGE_TAGS: ${{ steps.docker_build.outputs.tags }}
        run: |
          echo "image: ${IMAGE_FULL_NAME}"
          echo "tags: ${IMAGE_TAGS}"
71 changes: 71 additions & 0 deletions .github/workflows/ci-build.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# Runs on every push: verifies that the project builds with Gradle, then
# builds the image described by Dockerfile.software and publishes it to
# Docker Hub as `latest-develop`.
name: Build unstable

on: [push]

# All runs share one concurrency group; in-progress runs are not cancelled
# because `cancel-in-progress` is left disabled below.
concurrency:
  group: gradle
#  cancel-in-progress: true


jobs:
  # Gate job: compiles the project (tests excluded via `-x test`). No artifact
  # is uploaded; `docker-build` below performs its own checkout.
  build:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
      - name: Set up JDK 17
        uses: actions/setup-java@v4
        with:
          java-version: '17.0.10+7'
          distribution: 'temurin'
          cache: 'gradle'
      - name: Build with Gradle
        run: ./gradlew build -x test

      # Test and coverage reporting are currently disabled.
      # - name: Test with Gradle Jacoco and Coveralls
      #   run: ./gradlew test jacocoTestReport coveralls --no-daemon
      #
      # - name: Coveralls GitHub Action
      #   uses: coverallsapp/github-action@v2
      #   with:
      #     github-token: ${{ secrets.GITHUB_TOKEN }}
      #     format: jacoco

  # Builds and pushes the Docker image once the Gradle build has succeeded.
  docker-build:
    needs: [ build ]
    runs-on: ubuntu-latest

    steps:
      # Remove preinstalled toolchains that this job does not use, to free
      # disk space on the runner before the image build.
      - name: Create more disk space
        run: |
          sudo rm -rf /usr/share/dotnet
          sudo rm -rf /opt/ghc
          sudo rm -rf "/usr/local/share/boost"
          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
          sudo rm -rf /opt/hostedtoolcache
          sudo rm -rf /opt/google/chrome
          sudo rm -rf /opt/microsoft/msedge
          sudo rm -rf /opt/microsoft/powershell
          sudo rm -rf /opt/pipx
          sudo rm -rf /usr/lib/mono
          sudo rm -rf /usr/local/julia*
          sudo rm -rf /usr/local/lib/android
          sudo rm -rf /usr/local/lib/node_modules
          sudo rm -rf /usr/local/share/chromium
          sudo rm -rf /usr/local/share/powershell
          sudo rm -rf /usr/share/swift
      - uses: actions/checkout@v4
      - name: Build and push
        id: docker_build
        uses: mr-smithers-excellent/docker-build-push@v6
        with:
          dockerfile: Dockerfile.software
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
          image: lfoppiano/software-mentions
          registry: docker.io
          # With `on: [push]` as the only trigger this expression is always
          # true; it only matters if a `pull_request` trigger is added later.
          pushImage: ${{ github.event_name != 'pull_request' }}
          tags: latest-develop
      # NOTE(review): docker-build-push documents the outputs `imageFullName`,
      # `imageName` and `tags`; it has no `digest` output, so the previous
      # `echo ${{ steps.docker_build.outputs.digest }}` printed an empty line.
      # Values go through `env` so they are never spliced into the script text.
      - name: Image name and tags
        env:
          IMAGE_FULL_NAME: ${{ steps.docker_build.outputs.imageFullName }}
          IMAGE_TAGS: ${{ steps.docker_build.outputs.tags }}
        run: |
          echo "image: ${IMAGE_FULL_NAME}"
          echo "tags: ${IMAGE_TAGS}"
171 changes: 56 additions & 115 deletions Dockerfile.software
Original file line number Diff line number Diff line change
Expand Up @@ -7,146 +7,87 @@ FROM openjdk:17-jdk-slim as builder
USER root

RUN apt-get update && \
apt-get -y --no-install-recommends install unzip
apt-get -y --no-install-recommends install apt-utils libxml2 git-lfs unzip wget

WORKDIR /opt/grobid-source

# gradle
COPY gradle/ ./gradle/
COPY gradlew ./
COPY gradle.properties ./
COPY build.gradle ./
COPY settings.gradle ./

# source
COPY software-mentions/ ./software-mentions/
COPY grobid-home/ ./grobid-home/
COPY grobid-core/ ./grobid-core/
COPY grobid-service/ ./grobid-service/
COPY grobid-trainer/ ./grobid-trainer/

# cleaning unused native libraries before packaging
RUN rm -rf grobid-home/pdf2xml
RUN rm -rf grobid-home/pdfalto/lin-32
RUN rm -rf grobid-home/pdfalto/mac-64
RUN rm -rf grobid-home/pdfalto/win-*
RUN rm -rf grobid-home/lib/lin-32
RUN rm -rf grobid-home/lib/win-*
RUN rm -rf grobid-home/lib/mac-64

# cleaning unused datasets stuff
RUN rm -rf software-mentions/resources/dataset
WORKDIR /opt/grobid

RUN ./gradlew clean assemble install --no-daemon --info --stacktrace
RUN mkdir -p software-mentions-source grobid-home/models
COPY src software-mentions-source/src
COPY settings.gradle software-mentions-source/
COPY resources/config/config-docker.yml software-mentions-source/resources/config/config.yml
COPY resources/models software-mentions-source/resources/models
COPY resources/lexicon software-mentions-source/resources/lexicon
COPY build.gradle software-mentions-source/
COPY gradle software-mentions-source/gradle/
COPY gradlew software-mentions-source/
#COPY .git software-mentions-source/.git
COPY localLibs software-mentions-source/localLibs

# Preparing models
WORKDIR /opt/grobid/software-mentions-source
RUN rm -rf /opt/grobid/grobid-home/models/*
RUN ./gradlew clean assemble -x shadowJar --no-daemon --stacktrace --info
RUN git lfs install
RUN ./gradlew installModels --no-daemon --info --stacktrace \
&& rm -f /opt/grobid/grobid-home/models/*.zip

# Preparing distribution
WORKDIR /opt/grobid
RUN rm /opt/grobid/software-mentions-source/build/distributions/software-mentions-shadow* \
&& unzip -o /opt/grobid/software-mentions-source/build/distributions/software-mentions-*.zip -d software-mentions_distribution \
&& mv software-mentions_distribution/software-mentions-* software-mentions \
&& rm -rf /opt/grobid/software-mentions-source/build

WORKDIR ./software-mentions/
RUN ./gradlew clean install --no-daemon --info --stacktrace
# install Pub2TEI
WORKDIR /opt/
RUN wget https://github.com/kermitt2/Pub2TEI/archive/refs/heads/master.zip && \
unzip master.zip && \
mv Pub2TEI-master Pub2TEI && \
rm master.zip

WORKDIR /opt/grobid
RUN unzip -o /opt/grobid-source/grobid-service/build/distributions/grobid-service-*.zip && \
mv grobid-service* grobid-service
RUN unzip -o /opt/grobid-source/grobid-home/build/distributions/grobid-home-*.zip && \
chmod -R 755 /opt/grobid/grobid-home/pdfalto
RUN rm -rf grobid-source

# -------------------
# build runtime image
# -------------------

# use NVIDIA Container Toolkit to automatically recognize possible GPU drivers on the host machine
FROM tensorflow/tensorflow:2.7.0-gpu
FROM lfoppiano/grobid:0.8.1-full as runtime

# setting locale is likely useless but to be sure
ENV LANG C.UTF-8

# update NVIDIA Cuda key (following a key rotation in April 2022)
RUN apt-get install -y wget
RUN apt-key del 7fa2af80
RUN rm /etc/apt/sources.list.d/cuda.list
RUN rm /etc/apt/sources.list.d/nvidia-ml.list
RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb
RUN dpkg -i cuda-keyring_1.0-1_all.deb

# install JRE, python and other dependencies
RUN apt-get update && \
apt-get -y --no-install-recommends install apt-utils build-essential gcc libxml2 libfontconfig unzip curl \
openjdk-17-jre-headless openjdk-17-jdk ca-certificates-java \
musl gfortran \
python3 python3-pip python3-setuptools python3-dev \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

WORKDIR /opt/grobid
RUN rm -rf /opt/grobid/grobid-home/models/*-with_ELMo \
&& rm -rf /opt/grobid/grobid-service \
&& ln -sf software-mentions/resources/ resources

COPY --from=builder /opt/grobid .

RUN python3 -m pip install pip --upgrade
# the last command above is just a hack to make the lexicon loader working

# install DeLFT via pypi
RUN pip3 install requests delft==0.3.3
# link the data directory to /data
# the current working directory will most likely be /opt/grobid
RUN mkdir -p /data \
&& ln -s /data /opt/grobid/data \
&& ln -s /data ./data
COPY --from=builder /opt/grobid/grobid-home/models ./grobid-home/models
COPY --from=builder /opt/grobid/software-mentions ./software-mentions/
COPY --from=builder /opt/grobid/software-mentions-source/resources/config/config.yml ./software-mentions/resources/config/config.yml
COPY --from=builder /opt/grobid/software-mentions-source/resources/lexicon/ ./software-mentions/resources/lexicon/

# disable python warnings (and fix logging)
ENV PYTHONWARNINGS="ignore"
COPY --from=builder /opt/grobid/software-mentions /opt/grobid/software-mentions
COPY --from=builder /opt/Pub2TEI /opt/Pub2TEI

WORKDIR /opt/grobid
VOLUME ["/opt/grobid/grobid-home/tmp"]

ENV JAVA_OPTS=-Xmx4g
#WORKDIR /opt/grobid

# install jep (and temporarily the matching JDK)
ENV JDK_URL=https://download.java.net/java/GA/jdk17.0.2/dfd4a8d0985749f896bed50d7138ee7f/8/GPL/openjdk-17.0.2_linux-x64_bin.tar.gz
RUN curl --fail --show-error --location -q ${JDK_URL} -o /tmp/openjdk.tar.gz
RUN mkdir /tmp/jdk-17
RUN tar xvfz /tmp/openjdk.tar.gz --directory /tmp/jdk-17 --strip-components 1 --no-same-owner
RUN /tmp/jdk-17/bin/javac -version
RUN JAVA_HOME=/tmp/jdk-17 pip3 install jep==4.0.2
RUN rm -f /tmp/openjdk.tar.gz
RUN rm -rf /tmp/jdk-17
ENV LD_LIBRARY_PATH=/usr/local/lib/python3.8/dist-packages/jep:grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep:${LD_LIBRARY_PATH}
# remove libjep.so because we are providing our own version in the virtual env above
RUN rm /opt/grobid/grobid-home/lib/lin-64/jep/libjep.so

# preload embeddings

COPY --from=builder /opt/grobid-source/grobid-home/scripts/preload_embeddings.py .
COPY --from=builder /opt/grobid-source/grobid-home/config/resources-registry.json .
#RUN python3 preload_embeddings.py --embedding word2vec --registry ./resources-registry.json
RUN ln -s /opt/grobid /opt/delft

COPY --from=builder /opt/grobid-source/software-mentions /opt/grobid/software-mentions
COPY --from=builder /root/.m2/repository/org /opt/grobid/software-mentions/lib/org

# install Pub2TEI
WORKDIR /opt/
RUN wget https://github.com/kermitt2/Pub2TEI/archive/refs/heads/master.zip
RUN unzip master.zip
RUN mv Pub2TEI-master Pub2TEI

WORKDIR /opt/grobid/software-mentions

RUN mkdir /opt/grobid/delft
RUN mkdir /opt/grobid/delft/delft
COPY --from=builder /opt/grobid-source/grobid-home/config/resources-registry.json /opt/grobid/delft/delft/resources-registry.json

# trigger gradle wrapper install
RUN ./gradlew --version

# install all the ML models
RUN ./gradlew copyModels installModels && rm -rf resources/models && rm -f /opt/grobid/grobid-home/models/software/model.wapiti.gz && rm -f /opt/grobid/grobid-home/models/software-BERT-0.3.2.zip && rm -f /opt/grobid/grobid-home/models/context_bert-0.3.2.zip && rm -f /opt/grobid/grobid-home/models/context_used_bert-0.3.2.zip && rm -f /opt/grobid/grobid-home/models/context_shared_bert-0.3.2.zip && rm -f /opt/grobid/grobid-home/models/context_creation_bert-0.3.2.zip

RUN ./gradlew clean assemble install --no-daemon --stacktrace --info -x test

CMD ["sh", "-c", "java --add-opens java.base/java.lang=ALL-UNNAMED -jar build/libs/software-mentions-0.8.0-onejar.jar server resources/config/config.yml"]
# this will build and load embeddings on the image forever (only if required by the config) :)
# LF: AFAIK this is not needed at the moment as all the models are running with bert, but might
# be a solution if we want to support the GRU version
# RUN python3 preload_embeddings.py --registry ./resources-registry.json --embedding word2vec

ARG GROBID_VERSION
ENV GROBID_VERSION=${GROBID_VERSION:-latest}
ENV SOFTWARE_MENTIONS_OPTS "-Djava.library.path=/opt/grobid/grobid-home/lib/lin-64:/usr/local/lib/python3.8/dist-packages/jep --add-opens java.base/java.lang=ALL-UNNAMED"

CMD ["./software-mentions/bin/software-mentions", "server", "software-mentions/resources/config/config.yml"]

LABEL \
authors="The contributors" \
org.label-schema.name="software-mentions" \
org.label-schema.description="Image with software-mentions service" \
org.label-schema.url="https://github.com/softcite/software-mentions" \
org.label-schema.version=${GROBID_VERSION}
org.label-schema.version=${GROBID_VERSION}
Loading