From 66ae9ab6eb3751ae7507156e696e3215cab0a496 Mon Sep 17 00:00:00 2001 From: Christian Segundo Date: Mon, 7 Mar 2022 20:58:31 +0100 Subject: Full refactor --- misc/ngram.sh | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100755 misc/ngram.sh (limited to 'misc/ngram.sh') diff --git a/misc/ngram.sh b/misc/ngram.sh new file mode 100755 index 0000000..ce25a49 --- /dev/null +++ b/misc/ngram.sh @@ -0,0 +1,59 @@ +#!/usr/bin/env bash + +set -euo pipefail + +NGRAM_DATA_URL="https://languagetool.org/download/ngram-data/" +NGRAM_ARCHIVES=$(curl -sS "${NGRAM_DATA_URL}" | grep "\.zip" | grep -oP 'ngrams-\w\w-\d+\.zip' | sort | uniq) + +TARGET_LANGS="${1}" +TARGET_DIR="${2}" + +download_ngram() { + local lang=$1 + local version=$2 + local directory=$3 + + echo "Downloading: ${lang} - ${version}" + rm -rf "${directory}/${lang}" + curl --progress-bar "${NGRAM_DATA_URL}ngrams-${lang}-${version}.zip" |\ + bsdtar -x -f - -C "${directory}" +} + + +VERSION_FILE_CONTENT= + +if test -f "${TARGET_DIR}/version"; then + VERSION_FILE_CONTENT=$(cat "${TARGET_DIR}/version") + >"${TARGET_DIR}/version" +fi + +while read -r ngram_archive; do + for lang in ${TARGET_LANGS/,/ }; do + if echo "$(echo ${ngram_archive} | cut -d'-' -f2)" | grep -q $lang; then + version=$(echo ${ngram_archive} | cut -d'-' -f3 | cut -d'.' -f1) + if echo "${VERSION_FILE_CONTENT}" | grep -q "$lang"; then + existing_version=$(echo "${VERSION_FILE_CONTENT}" | grep "${lang}" | cut -d':' -f2) + current_version=$(echo "${ngram_archive}" | cut -d'-' -f3 | cut -d'.' -f1) + if [[ "${existing_version}" -lt "${current_version}" ]]; then + download_ngram "${lang}" "${version}" "${TARGET_DIR}" + fi + else + download_ngram "${lang}" "${version}" "${TARGET_DIR}" + fi + echo "${lang}:${version}" >> "${TARGET_DIR}/version" + fi + done +done <<< "${NGRAM_ARCHIVES}" + +for i in $(ls -d "${TARGET_DIR}"/*/); do + realdir="$(basename ${i})" + delete=yes + for lang in ${TARGET_LANGS/,/ }; do + if echo "${realdir}" | grep -q "${lang}"; then + delete=no + fi + done + if [ "${delete}" == "yes" ]; then + rm -rf "${TARGET_DIR}/${realdir}" + fi +done -- cgit v1.2.3