https://htn20190109.hatenablog.com/entry/2025/08/08/233311

https://nikkie-ftnext.hatenablog.com/entry/ocr-with-tesseract-macos-japanese-text-environment
https://www.delftstack.com/ja/howto/python/selenium-scroll-down-python/

mac OS: Monterey 12.7.6

前提:
メール送信(mac)設定済み
下記をrootのcrontabに設定済
*/10 * * * * /usr/sbin/postfix start

シェル名: check_web_updates_v2.sh

設計:
①前回取得分を削除、今回ファイルを、前回ファイルにリネーム、テキストファイル削除
②seleniumでスクリーンショット取得、(ヘッダー変動画像回避あり、スクロール回数指定あり)
③前回取得分と今回取得分の比較
④類似度が閾値未満の場合(=前回画像との差分が大きい場合)、Tesseractで画像からテキストファイルを作成
⑤テキストファイルからチェック対象文字存在チェック。存在する場合、メール送信

注意事項:
OS再起動した場合、一度画面からログインしないとchromeを起動できない
shellでは全コマンドを原則、フルパスで記載する必要あり( cronで実行エラーとなるため )

下記にフルディスクアクセス権限付与
/usr/bin/open
/usr/sbin/cron
/usr/sbin/smbd

confファイル:
check_web_updates_v2.conf

URLキー,ヘッダーサイズ,スクロール回数,閾値,URL値
のcsvファイル

(例)
key1,0,4,0.9,https://hoge.example.com
key2,300,4,0.9,https://fuga.example.com
key3,0,4,0.9,https://piyo.example.com

画像ファイル名:
<URLキー>_<スクロール連番>_af.png
<URLキー>_<スクロール連番>_bf.png

テキストファイル名:
<URLキー>_<スクロール連番>_af.txt

■スクリーンショットを取得するPython スクリプト

python3.11

pip3.11 install selenium
pip3.11 install chromedriver_binary

cat <<-'EOF' > get_screenshots_v2.py

"""Capture scrolled screenshots of every URL listed in the conf file.

Each conf row is ``key,header,scroll,threshold,url``.  For every row the page
is opened in Chrome, optionally scrolled past a header area of ``header``
pixels, and then ``scroll`` screenshots are saved as
``<image_dir_name>/<key>_<n>_af.png`` with a 500px scroll between shots.
"""
from selenium import webdriver
import chromedriver_binary  # noqa: F401  imported for its side effect only (chromedriver lookup)
import time
import csv
import os

base_dir_name = '/Users/testuser/check_web_updates_v2'
image_dir_name = '/Volumes/share/Image_v2'


def main():
    driver = webdriver.Chrome()
    try:
        conf_path = base_dir_name + "/" + "check_web_updates_v2.conf"
        with open(conf_path, "r", encoding="utf8") as csv_file:
            f = csv.reader(csv_file, delimiter=",", doublequote=True,
                           lineterminator="\n", quotechar='"',
                           skipinitialspace=False)
            for row in f:
                # Blank or truncated rows carry no URL to visit; skip them
                # instead of failing on a missing column.
                if len(row) < 5:
                    continue

                key = row[0]
                header = int(row[1])
                scroll = int(row[2])
                url = row[4]

                driver.get(url)
                # Fixed wait so the page can finish rendering before capture.
                time.sleep(10)

                # Scroll past a header area whose image changes on every load,
                # so it does not show up as a difference.
                if header > 0:
                    driver.execute_script("window.scrollBy(0, " + str(header) + ");")
                    time.sleep(5)

                for i in range(scroll):
                    file_name = image_dir_name + '/' + key + '_' + str(i) + '_af.png'
                    driver.save_screenshot(file_name)
                    driver.execute_script("window.scrollBy(0, 500);")
                    time.sleep(5)
    finally:
        # quit() ends the whole WebDriver session; close() would only close
        # the current window.  Running it in ``finally`` also covers failures
        # in the middle of the loop.
        driver.quit()


if __name__ == "__main__":
    main()

EOF

python3.11 get_screenshots_v2.py

■２画像の画素値を比較し類似度を算出するPython スクリプト

python3.11

pip3.11 install pillow
pip3.11 install numpy

cat <<-'EOF' > diff_screenshots_v2.py

"""Compare previous and current screenshots and report the ones that changed.

For every conf row (``key,header,scroll,threshold,url``) and every scroll
index ``i`` the pair ``<key>_<i>_bf.png`` / ``<key>_<i>_af.png`` is compared.
When the fraction of identical array elements is below the row's threshold,
``;<key>_<i>`` is appended to the output.  The concatenated result is written
to stdout without a trailing newline (empty when nothing changed).
"""
import numpy as np
import os
import csv
import sys

base_dir_name = '/Users/testuser/check_web_updates_v2'
image_dir_name = '/Volumes/share/Image_v2'


def calc_similarity(array1, array2):
    """Return the fraction (0.0 - 1.0) of elements that are equal.

    Arrays of different shape are treated as completely different (0.0):
    an element-wise comparison is not defined for them.
    """
    if array1.shape != array2.shape:
        return 0.0
    if array1.size == 0:
        # Two empty arrays of the same shape have nothing that differs.
        return 1.0
    return np.count_nonzero(array1 == array2) / array1.size


def main():
    # Imported here so that calc_similarity can be used without Pillow.
    from PIL import Image

    output = ""

    conf_path = base_dir_name + "/" + "check_web_updates_v2.conf"
    with open(conf_path, "r", encoding="utf8") as csv_file:
        f = csv.reader(csv_file, delimiter=",", doublequote=True,
                       lineterminator="\n", quotechar='"',
                       skipinitialspace=False)
        for row in f:
            # Blank or truncated rows cannot be evaluated; skip them.
            if len(row) < 5:
                continue

            key = row[0]
            scroll = int(row[2])
            threshold = float(row[3])

            for i in range(scroll):
                file1 = image_dir_name + '/' + key + '_' + str(i) + '_bf.png'
                file2 = image_dir_name + '/' + key + '_' + str(i) + '_af.png'

                # Only compare when both file1 and file2 exist
                # (on the first run there is no previous image yet).
                if os.path.exists(file1) and os.path.exists(file2):
                    with Image.open(file1) as image1, Image.open(file2) as image2:
                        array1 = np.array(image1)
                        array2 = np.array(image2)

                    similarity = calc_similarity(array1, array2)

                    if similarity < threshold:
                        output = output + ';' + key + '_' + str(i)

    sys.stdout.write(output)


if __name__ == "__main__":
    main()

EOF

python3.11 diff_screenshots_v2.py

■本体シェル

cat <<-'EOF' > check_web_updates_v2.sh
#!/bin/bash
#
# check_web_updates_v2.sh
#
# Takes screenshots of the URLs listed in the conf file, compares them with
# the screenshots of the previous run, OCRs the ones that changed and sends a
# mail when one of the watched words appears in the OCR text.
#
# Conf file format (csv): key,header,scroll,threshold,url
# Every external command is called by full path so the script also works
# under cron.

BASE_DIR_NAME=/Users/testuser/check_web_updates_v2
MOUNT_POINT=/Volumes/share
IMAGE_DIR_NAME="${MOUNT_POINT}/Image_v2"
CONF_FILE="${BASE_DIR_NAME}/check_web_updates_v2.conf"
LOG_FILE="${BASE_DIR_NAME}/check_web_updates_v2.log"

# Words searched for in the OCR text (grep patterns). Add entries as needed.
CHECK_WORDS=("戦国自衛隊")

# Appends a timestamped message ($1) to the log file.
log() {
  /bin/echo "[$(/bin/date '+%Y/%m/%d %H:%M:%S')] $1" >> "${LOG_FILE}"
}

# Start log
log "job start"

# 1. Exit without doing anything when the target host does not answer
if ! /sbin/ping -c 1 10.11.21.1; then
  log "job skipped (10.11.21.1 unreachable)"
  exit 0
fi

# 1. Mount the image directory
# NOTE(review): the SMB user/password are embedded in this URL in plain text;
# consider keeping them in the keychain instead.
/usr/bin/open "smb://admin:admin@10.11.21.4/share"

/bin/sleep 10

# Without the share every later step would fail, so stop here.
if [ ! -d "${IMAGE_DIR_NAME}" ]; then
  log "job aborted (${IMAGE_DIR_NAME} is not available)"
  exit 1
fi

# 2. Delete the previous images, rename the current images to "previous",
#    delete the OCR text files.
#    The conf file is read on fd 3 so commands inside the loop cannot consume it.
while IFS=, read -r -u 3 key _header scroll _rest || [ -n "${key}" ]; do
  # Skip blank lines and rows whose scroll count is not a plain number.
  case "${scroll}" in
    '' | *[!0-9]*) continue ;;
  esac

  # C-style loop: BSD seq counts downwards for 0 ("seq 0" prints 1 and 0).
  for ((idx = 0; idx < scroll; idx++)); do
    prefix="${IMAGE_DIR_NAME}/${key}_${idx}"

    /bin/rm -f "${prefix}_bf.png"
    # On the first run there is no current image to rotate yet.
    if [ -e "${prefix}_af.png" ]; then
      /bin/mv "${prefix}_af.png" "${prefix}_bf.png"
    fi
    /bin/rm -f "${prefix}_af.txt"
  done
done 3< "${CONF_FILE}"

# 3. Take the screenshots
/usr/local/bin/python3.11 "${BASE_DIR_NAME}/get_screenshots_v2.py"

# 4. Compare the image files; the output has the form ";key_0;key_3"
result=$(/usr/local/bin/python3.11 "${BASE_DIR_NAME}/diff_screenshots_v2.py")

# 5. For every changed image, create a text file with Tesseract and
#    check it for the watched words.
result2=""

if [ -n "${result}" ]; then

  grep_args=()
  for word in "${CHECK_WORDS[@]}"; do
    grep_args+=(-e "${word}")
  done

  while IFS=, read -r -u 3 key _header scroll _rest || [ -n "${key}" ]; do
    case "${scroll}" in
      '' | *[!0-9]*) continue ;;
    esac

    for ((idx = 0; idx < scroll; idx++)); do
      file_key="${key}_${idx}"

      # Run OCR only when ${result} contains exactly this entry.
      # Matching with both delimiters keeps "key1_1" from matching "key1_10".
      case "${result};" in
        *";${file_key};"*) ;;
        *) continue ;;
      esac

      filebase="${IMAGE_DIR_NAME}/${file_key}_af"

      if ! /usr/local/bin/tesseract -l jpn "${filebase}.png" "${filebase}"; then
        log "tesseract failed: ${filebase}.png"
        continue
      fi

      # Remember the entry when the OCR text contains a watched word.
      if /usr/bin/grep -q "${grep_args[@]}" "${filebase}.txt"; then
        result2="${result2};${file_key}"
      fi
    done
  done 3< "${CONF_FILE}"
fi

# 6. When there are hits, send them by mail (one entry per line)
if [ -n "${result2}" ]; then
  newline=$'\n'
  printf '%s\n' "${result2//;/${newline}}" \
    | /usr/bin/mail -s "website_updated_v2" hoge@example.com
fi

# 7. Unmount the image directory
/sbin/umount "${MOUNT_POINT}"

# End log
log "job end"

exit 0

EOF

chmod +x check_web_updates_v2.sh

./check_web_updates_v2.sh

■クーロン登録

crontab -e

40 6,18 * * * /Users/testuser/check_web_updates_v2/check_web_updates_v2.sh

crontab -l

log stream --info --predicate 'process == "cron"'
log show --info --predicate 'process == "cron"' --start '2017-05-25'