diff --git a/apps/common/Dockerfile b/apps/common/Dockerfile index 08d99c3ef5..b7a575490c 100644 --- a/apps/common/Dockerfile +++ b/apps/common/Dockerfile @@ -143,6 +143,8 @@ COPY bin/build_jieba_dict_cache.py / RUN \ /build_jieba_dict_cache.py && \ rm /build_jieba_dict_cache.py && \ + chown mediacloud:mediacloud /var/tmp/jieba.cache && \ + ls -l /var/tmp/jieba.cache && \ true # Symlink Log::Log4perl configuration to where it's going to be found diff --git a/apps/common/src/python/mediawords/solr/request.py b/apps/common/src/python/mediawords/solr/request.py index 5694c0e0da..4e79984c6a 100644 --- a/apps/common/src/python/mediawords/solr/request.py +++ b/apps/common/src/python/mediawords/solr/request.py @@ -24,6 +24,8 @@ __QUERY_HTTP_TIMEOUT = 15 * 60 """Timeout of a single HTTP query.""" +# Testing alias!! +SOLR_COLLECTION = 'mediacloud2' class _AbstractSolrRequestException(Exception, metaclass=abc.ABCMeta): """Abstract .solr.request exception.""" @@ -59,7 +61,7 @@ def __wait_for_solr_to_start(config: Optional[CommonConfig]) -> None: """Wait for Solr to start and collections to become available, if needed.""" # search for an empty or rare term here because searching for *:* sometimes causes a timeout for some reason - sample_select_url = f"{config.solr_url()}/mediacloud/select?q=BOGUSQUERYTHATRETURNSNOTHINGNADA&rows=1&wt=json" + sample_select_url = f"{config.solr_url()}/{SOLR_COLLECTION}/select?q=BOGUSQUERYTHATRETURNSNOTHINGNADA&rows=1&wt=json" connected = False @@ -191,7 +193,7 @@ def solr_request(path: str, if not params: params = {} - abs_uri = furl(f"{solr_url}/mediacloud/{path}") + abs_uri = furl(f"{solr_url}/{SOLR_COLLECTION}/{path}") abs_uri = abs_uri.set(params) abs_url = str(abs_uri) diff --git a/apps/common/src/python/mediawords/util/config/__init__.py b/apps/common/src/python/mediawords/util/config/__init__.py index 08f12feb8e..53819ff3e5 100644 --- a/apps/common/src/python/mediawords/util/config/__init__.py +++ 
b/apps/common/src/python/mediawords/util/config/__init__.py @@ -46,6 +46,16 @@ def env_value(name: str, required: bool = True, allow_empty_string: bool = False return value +def env_bool(name: str, default: bool = False) -> bool: + """ + Retrieve boolean from environment variable; should be 0 or 1. + + :param name: Environment variable name. + :param default: default value, if no value found. + """ + + value = os.environ.get(name, default) + return bool(int(value)) def file_with_env_value(name: str, allow_empty_string: bool = False, encoded_with_base64: bool = False) -> str: """ diff --git a/apps/common/src/requirements.txt b/apps/common/src/requirements.txt index 3bb17a43d9..6b8237199f 100644 --- a/apps/common/src/requirements.txt +++ b/apps/common/src/requirements.txt @@ -43,6 +43,10 @@ furl==2.1.0 # Chinese language tokenizer, stemmer, etc. jieba==0.42.1 +# Pin for Jinja2 2.11.3, which requires MarkupSafe>=0.23 and would otherwise +# pull in version 2.1.1, which removed a deprecated function Jinja2 needs. +MarkupSafe==2.0.1 + # Parsing email templates Jinja2==2.11.3 diff --git a/apps/docker-compose.dist.yml b/apps/docker-compose.dist.yml index 32fadf7436..84a3960c6f 100644 --- a/apps/docker-compose.dist.yml +++ b/apps/docker-compose.dist.yml @@ -1813,7 +1813,8 @@ services: placement: constraints: # Must run on the host with Temporal Grafana data volume - - node.labels.role-temporal-grafana == true + # - node.labels.role-temporal-grafana == true + - node.labels.role-monitoring == true # Worker count replicas: 1 resources: @@ -1909,7 +1910,8 @@ services: placement: constraints: # Must run on the host with Temporal Prometheus data volume - - node.labels.role-temporal-prometheus == true + # - node.labels.role-temporal-prometheus == true + - node.labels.role-monitoring == true # Worker count replicas: 1 resources: @@ -2237,7 +2239,33 @@ services: # RAM limit memory: "2G" - + # + # Temporal Alertmanager (alert routing for Temporal Prometheus) + # ------------------------------------------------------------- + # 
temporal-alertmanager: + image: thepsalmist/temporal-alertmanager:release_monitoring_v2 + init: true + depends_on: + - temporal-prometheus + networks: + - default + expose: + - "9093" + volumes: + - vol_temporal_alertmanager_data:/opt/alertmanager/data/ + deploy: + <<: *endpoint-mode-dnsrr + placement: + constraints: + # Must run on the host with Temporal Alertmanager data volume + - node.labels.role-monitoring == true + # Worker count + replicas: 1 + resources: + limits: + cpus: "1" + memory: "1G" # # Networks # ======== @@ -2544,3 +2572,11 @@ volumes: type: none o: bind device: /space/mediacloud/vol_temporal_grafana_data + + # Temporal Alertmanager data + vol_temporal_alertmanager_data: + driver: local + driver_opts: + type: none + o: bind + device: /space/mediacloud/vol_temporal_alertmanager_data diff --git a/apps/extract-and-vector/bin/extract_and_vector_worker.py b/apps/extract-and-vector/bin/extract_and_vector_worker.py index 0738c6e200..7a21a67864 100755 --- a/apps/extract-and-vector/bin/extract_and_vector_worker.py +++ b/apps/extract-and-vector/bin/extract_and_vector_worker.py @@ -4,6 +4,7 @@ from mediawords.db import connect_to_db from mediawords.job import JobBroker +from mediawords.util.config import env_bool from mediawords.util.log import create_logger from mediawords.util.perl import decode_object_from_bytes_if_needed from extract_and_vector.dbi.stories.extractor_arguments import PyExtractorArguments @@ -69,8 +70,10 @@ def run_extract_and_vector(stories_id: int, use_cache: bool = False, use_existin log.info("Extracting story {}...".format(stories_id)) + no_dedup_sentences = env_bool('MC_NO_DEDUP_SENTENCES', True) try: - extractor_args = PyExtractorArguments(use_cache=use_cache, use_existing=use_existing) + extractor_args = PyExtractorArguments(use_cache=use_cache, use_existing=use_existing, + no_dedup_sentences=no_dedup_sentences) extract_and_process_story(db=db, story=story, extractor_args=extractor_args) except Exception as ex: diff --git 
a/apps/import-solr-data/src/perl/MediaWords/Solr/Dump.pm b/apps/import-solr-data/src/perl/MediaWords/Solr/Dump.pm index a260cb109c..223dea7fcd 100644 --- a/apps/import-solr-data/src/perl/MediaWords/Solr/Dump.pm +++ b/apps/import-solr-data/src/perl/MediaWords/Solr/Dump.pm @@ -55,7 +55,7 @@ Readonly my @SOLR_FIELDS => qw/stories_id media_id publish_date publish_day publ text title language processed_stories_id tags_id_stories timespans_id/; # how many sentences to fetch at a time from the postgres query -Readonly my $FETCH_BLOCK_SIZE => 100; +Readonly my $FETCH_BLOCK_SIZE => 200; # default time sleep when there are less than MIN_STORIES_TO_PROCESS: Readonly my $DEFAULT_THROTTLE => 60; @@ -601,6 +601,7 @@ Options: * throttle -- sleep this number of seconds between each block of stories (default 60) * full -- shortcut for: update=false, empty_queue=true, throttle=1; assume and optimize for static queue * skip_logging -- skip logging the import into the solr_import_stories or solr_imports tables (default=false) +* skip_update_snapshot -- skip setting snapshots.searchable=true (default=true) The import will run in blocks of "max_queued_stories" at a time. The function will keep trying to find stories to import. 
If there are less than @@ -627,6 +628,7 @@ sub import_data($;$) my $empty_queue = $options->{ empty_queue } // 0; my $throttle = $options->{ throttle } // $DEFAULT_THROTTLE; my $skip_logging = $options->{ skip_logging } // 0; + my $skip_update_snapshot = $options->{ skip_update_snapshot } // 1; my $daemon = $options->{ daemon } // 0; $_last_max_queue_stories_id = 0; @@ -669,7 +671,7 @@ sub import_data($;$) _save_import_log( $db, $stories_ids ); } - if ( !$skip_logging ) + if ( !$skip_logging && !$skip_update_snapshot ) { _update_snapshot_solr_status( $db ); } diff --git a/apps/postgresql-pgbouncer/conf/pgbouncer.ini b/apps/postgresql-pgbouncer/conf/pgbouncer.ini index eb3f28662c..f7a14b215d 100644 --- a/apps/postgresql-pgbouncer/conf/pgbouncer.ini +++ b/apps/postgresql-pgbouncer/conf/pgbouncer.ini @@ -1,5 +1,6 @@ [databases] -* = host=postgresql-server port=5432 user=mediacloud +; PhilB 5/6/22: PG server running on postgresql EC2 server w/o docker +* = host=172.30.0.58 port=5432 user=mediacloud [pgbouncer] diff --git a/apps/postgresql-server/bin/apply_migrations.sh b/apps/postgresql-server/bin/apply_migrations.sh index bcc2d702e0..77267db3a7 100755 --- a/apps/postgresql-server/bin/apply_migrations.sh +++ b/apps/postgresql-server/bin/apply_migrations.sh @@ -14,7 +14,8 @@ MIGRATIONS_DIR="/opt/postgresql-server/pgmigrate/migrations" TEMP_PORT=12345 # In case the database is in recovery, wait for up to 1 hour for it to complete -PGCTL_START_TIMEOUT=3600 +# PLB: increased to three hours +PGCTL_START_TIMEOUT=10800 if [ ! -d "${MIGRATIONS_DIR}" ]; then echo "Migrations directory ${MIGRATIONS_DIR} does not exist." 
diff --git a/apps/rabbitmq-server/Dockerfile b/apps/rabbitmq-server/Dockerfile index b5ff424313..d31e142257 100644 --- a/apps/rabbitmq-server/Dockerfile +++ b/apps/rabbitmq-server/Dockerfile @@ -2,7 +2,7 @@ # RabbitMQ server # -FROM gcr.io/mcback/base:latest +FROM gcr.io/mcback/base:release # Add RabbitMQ APT repository RUN \ diff --git a/apps/rabbitmq-server/conf/enabled_plugins b/apps/rabbitmq-server/conf/enabled_plugins index d8bb228458..402c318d8f 100644 --- a/apps/rabbitmq-server/conf/enabled_plugins +++ b/apps/rabbitmq-server/conf/enabled_plugins @@ -1 +1 @@ -[rabbitmq_amqp1_0,rabbitmq_management,rabbitmq_management_visualiser,rabbitmq_shovel,rabbitmq_shovel_management]. +[rabbitmq_amqp1_0,rabbitmq_management,rabbitmq_management_visualiser,rabbitmq_shovel,rabbitmq_shovel_management,rabbitmq_prometheus]. diff --git a/apps/solr-base/Dockerfile b/apps/solr-base/Dockerfile index 0ff5015f9c..6c76d33d8e 100644 --- a/apps/solr-base/Dockerfile +++ b/apps/solr-base/Dockerfile @@ -19,5 +19,18 @@ RUN \ RUN mkdir -p /usr/src/ COPY src/solr/ /usr/src/solr/ +# Try to create 64-bit enabled mediacloud64 collection by cloning config +# NOTE: collections/mediacloud/conf/solrconfig.xml uses +# ${mediacloud.luceneMatchVersion} ${mediacloud.solr_webapp_dir} ${mediacloud.solr_dist_dir} +# which reference JVM properties set in solr-shard/bin/solr-shard.sh +# ALSO: core.properties has "instanceDir=/var/lib/solr/mediacloud" (dir does not exist?!) 
+# will be whacked to .../mediacloud64 (also does not exist) +RUN \ + mkdir -p /usr/src/solr/collections/mediacloud64 && \ + cp -rp /usr/src/solr/collections/mediacloud/* /usr/src/solr/collections/mediacloud64/ && \ + sed -i.32 's/mediacloud/mediacloud64/' /usr/src/solr/collections/mediacloud64/core.properties && \ + sed -i.32 '/ 10000 + for: 5m + labels: + severity: critical + annotations: + summary: "Rabbitmq queue filling up" + description: "Queue is filling up" + + - alert: RabbitmqTooManyMessagesInQueue + expr: rabbitmq_queue_messages_ready{queue="MediaWords::Job::ExtractAndVector"} > 40000 + for: 5m + labels: + severity: warning + annotations: + summary: "Rabbitmq too many messages in queue" + description: "Queue is filling up (> 40000 msgs)" diff --git a/apps/temporal-prometheus/prometheus.yml b/apps/temporal-prometheus/prometheus.yml index 0a62dfbacb..b4209f0e04 100644 --- a/apps/temporal-prometheus/prometheus.yml +++ b/apps/temporal-prometheus/prometheus.yml @@ -1,10 +1,20 @@ global: - scrape_interval: 5s - scrape_timeout: 5s + scrape_interval: 15s + scrape_timeout: 30s + +rule_files: + - 'alert.rules' + +alerting: + alertmanagers: + - static_configs: + - targets: + - "alertmanager:9093" scrape_configs: - job_name: 'prometheus' + metrics_path: /metrics static_configs: - targets: - 'localhost:9090' @@ -20,3 +30,11 @@ scrape_configs: - 'temporal-server:9093' # worker - 'temporal-server:9094' + + # rabbitmq monitoring from rabbitmq_prometheus plugin + - job_name: 'rabbitmq' + metrics_path: /metrics + static_configs: + - targets: + - "rabbitmq-server:15692" + diff --git a/apps/webapp-api/src/perl/MediaWords/Controller/Api/V2/Topics/Timespans.pm b/apps/webapp-api/src/perl/MediaWords/Controller/Api/V2/Topics/Timespans.pm index 3fbb4e236e..cb87851cc7 100644 --- a/apps/webapp-api/src/perl/MediaWords/Controller/Api/V2/Topics/Timespans.pm +++ b/apps/webapp-api/src/perl/MediaWords/Controller/Api/V2/Topics/Timespans.pm @@ -88,7 +88,7 @@ SQL snapshots_id FROM timespans 
AS t where - topics_id = ? AND + topics_id = ? $snapshot_clause $focus_clause $timespan_clause