Skip to content

Commit

Permalink
crawler Firefox 🦊 support
Browse files Browse the repository at this point in the history
Changelog: feature
  • Loading branch information
biolds committed Oct 14, 2023
1 parent 5454df8 commit 093a284
Show file tree
Hide file tree
Showing 31 changed files with 506 additions and 178 deletions.
52 changes: 43 additions & 9 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,10 @@ unit_tests:
script:
- 'grep ^Depends: debian/control | sed -e "s/.*},//" -e "s/,//g" | xargs apt install -y'
- /etc/init.d/postgresql start
- mkdir -p /var/lib/sosse/screenshots /var/lib/sosse/html /var/lib/sosse/downloads/0 && chown -R www-data:www-data /var/lib/sosse/
- mkdir -p /var/lib/sosse/screenshots /var/lib/sosse/html /var/lib/sosse/log && chown -R www-data:www-data /var/lib/sosse/ /var/log/sosse
- /usr/bin/python3 /root/httpbin/httpbin/manage.py runserver 0.0.0.0:8000 &
- export PYTHONPATH="$CI_PROJECT_DIR"
- /tmp/sudo_sosse default_conf | sed -e "s/^#debug=.*/debug=true/" -e "s/^#browser_options=\(.*\)/browser_options=\1 --no-sandbox --disable-dev-shm-usage/" -e "s/#dl_check_time=.*/dl_check_time=1/" > /etc/sosse/sosse.conf
- /tmp/sudo_sosse default_conf | sed -e "s/^#debug=.*/debug=true/" -e "s/^#chromium_options=\(.*\)/chromium_options=\1 --no-sandbox --disable-dev-shm-usage/" -e "s/#dl_check_time=.*/dl_check_time=1/" > /etc/sosse/sosse.conf
- /tmp/sudo_sosse test -v3 --failfast
- /tmp/sudo_sosse load_se tests/opensearch.xml
- /tmp/sudo_sosse update_se
Expand All @@ -45,7 +45,7 @@ unit_tests:
- python3-coverage html
- python3-coverage xml

functional_tests:
.functional_tests: &functional_tests
image: biolds/sosse:pip-test
stage: test
needs:
Expand All @@ -62,6 +62,16 @@ functional_tests:
- mv /var/log/sosse log
- test "$(cat /tmp/ret_code)" -eq 0

functional_tests_chromium:
<<: *functional_tests
variables:
BROWSER: chromium

functional_tests_firefox:
<<: *functional_tests
variables:
BROWSER: firefox

migrations:
image: biolds/sosse:debian-test
stage: test
Expand Down Expand Up @@ -112,7 +122,8 @@ doc:
- doc/build/*
needs:
- doc_gen
- functional_tests
- functional_tests_chromium
#- functional_tests_firefox -> commented out because only the artifacts (screenshots) from Chromium are needed
script:
- make _build_doc
- test -n "$RTD_TOKEN" && curl -X POST -d "branches=main" -d "token=$RTD_TOKEN" -d "default_branch=main" https://readthedocs.org/api/v2/webhook/sosse/236935/ || true
Expand All @@ -127,7 +138,8 @@ pip_pkg:
- doc
- migrations
- unit_tests
- functional_tests
- functional_tests_chromium
- functional_tests_firefox
- static_checks
script:
- sed -e "s/^SOSSE_VERSION_TAG = .*/SOSSE_VERSION_TAG = '${CI_COMMIT_TAG/v/}'/" -i sosse/settings.py
Expand All @@ -142,7 +154,8 @@ debian_pkg:
- doc
- migrations
- unit_tests
- functional_tests
- functional_tests_chromium
- functional_tests_firefox
- static_checks
artifacts:
paths:
Expand Down Expand Up @@ -192,7 +205,7 @@ doc_test_pip:
- apt install -y make jq
- make _doc_test_pip

debian_pkg_check:
.debian_pkg_check: &debian_pkg_check
image: debian:bookworm
stage: build_check
artifacts:
Expand All @@ -212,7 +225,17 @@ debian_pkg_check:
- mv /var/log/sosse log
- test "$(cat /tmp/ret_code)" -eq 0

pip_pkg_check:
debian_pkg_check_chromium:
<<: *debian_pkg_check
variables:
BROWSER: chromium

debian_pkg_check_firefox:
<<: *debian_pkg_check
variables:
BROWSER: firefox

.pip_pkg_check: &pip_pkg_check
image: biolds/sosse:pip-test
stage: build_check
artifacts:
Expand All @@ -230,6 +253,16 @@ pip_pkg_check:
- mv /var/log/sosse log
- test "$(cat /tmp/ret_code)" -eq 0

pip_pkg_check_chromium:
<<: *pip_pkg_check
variables:
BROWSER: chromium

pip_pkg_check_firefox:
<<: *pip_pkg_check
variables:
BROWSER: firefox

discord_tag_notif:
image: debian:bookworm
stage: notif
Expand All @@ -239,7 +272,8 @@ discord_tag_notif:
needs:
- doc_gen
- pip_pkg
- debian_pkg_check
- debian_pkg_check_chromium
- debian_pkg_check_firefox
script: |
test -n "$DISCORD_TAG_NOTIF_URL"
apt update
Expand Down
7 changes: 4 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,11 @@ ADD sosse/ sosse/
RUN virtualenv /venv
RUN /venv/bin/pip install ./ && /venv/bin/pip install uwsgi && /venv/bin/pip cache purge
ADD debian/sosse.conf /etc/nginx/sites-enabled/default
RUN mkdir -p /etc/sosse/ /etc/sosse_src/ /var/log/sosse /var/log/uwsgi
RUN /venv/bin/sosse-admin default_conf | sed -e 's/^#db_pass.*/db_pass=sosse/' -e 's/^#\(browser_options=.*\)$/\1 --no-sandbox --disable-dev-shm-usage/' > /etc/sosse_src/sosse.conf
RUN mkdir -p /etc/sosse/ /etc/sosse_src/ /var/log/sosse /var/log/uwsgi /var/www/.cache /var/www/.mozilla
RUN /venv/bin/sosse-admin default_conf | sed -e 's/^#db_pass.*/db_pass=sosse/' -e 's/^#\(chromium_options=.*\)$/\1 --no-sandbox --disable-dev-shm-usage/' > /etc/sosse_src/sosse.conf
ADD debian/uwsgi.* /etc/sosse_src/
RUN chown -R root:www-data /etc/sosse /etc/sosse_src && chmod 750 /etc/sosse_src/ && chmod 640 /etc/sosse_src/*
RUN chown www-data:www-data /var/log/sosse /var/www/.cache /var/www/.mozilla

WORKDIR /
USER postgres
Expand All @@ -26,7 +27,7 @@ USER root
RUN echo '#!/bin/bash -x \n \
/etc/init.d/postgresql start \n \
test -e /etc/sosse/sosse.conf || (cp -p /etc/sosse_src/* /etc/sosse/) \n \
mkdir -p /run/sosse /var/lib/sosse/html/ \n \
mkdir -p /run/sosse /var/log/sosse /var/lib/sosse/html/ \n \
touch /var/log/sosse/{debug.log,main.log,crawler.log,uwsgi.log,webserver.log} \n \
chown -R www-data:www-data /run/sosse /var/log/sosse/ /var/lib/sosse \n \
/venv/bin/sosse-admin migrate \n \
Expand Down
6 changes: 4 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
TMP ?= /tmp
BROWSER ?= chromium
current_dir = $(shell pwd)

.PHONY: _pip_pkg pip_pkg _pip_pkg_push pip_pkg_push _deb \
Expand Down Expand Up @@ -130,7 +131,7 @@ _pip_pkg_functional_tests:
_common_pip_functional_tests:
cp doc/code_blocks.json /tmp/code_blocks.json
grep -q 'sosse-admin default_conf' /tmp/code_blocks.json
sed -e 's#sosse-admin default_conf#sosse-admin default_conf | sed -e \\"s/^.browser_options=.*/browser_options=--enable-precise-memory-info --disable-default-apps --incognito --headless --no-sandbox --disable-dev-shm-usage/\\" -e \\"s/^.browser_crash_retry=.*/browser_crash_retry=3/\\" -e \\"s/^.crawler_count=.*/crawler_count=1/\\" -e \\"s/^.debug=.*/debug=true/\\"#' -i /tmp/code_blocks.json # add --no-sandbox --disable-dev-shm-usage to chromium's command line
sed -e 's#sosse-admin default_conf#sosse-admin default_conf | sed -e \\"s/^.chromium_options=.*/chromium_options=--enable-precise-memory-info --disable-default-apps --incognito --headless --no-sandbox --disable-dev-shm-usage/\\" -e \\"s/^.browser_crash_retry=.*/browser_crash_retry=3/\\" -e \\"s/^.crawler_count=.*/crawler_count=1/\\" -e \\"s/^.debug=.*/debug=true/\\"#' -e \\"s/^.default_browser=.*/default_browser=$(BROWSER)/\\" -i /tmp/code_blocks.json # add --no-sandbox --disable-dev-shm-usage to chromium's command line
echo 'SOSSE_ADMIN: /opt/sosse-venv/bin/sosse-admin' > tests/robotframework/config.yaml

_deb_pkg_functional_tests:
Expand All @@ -141,11 +142,12 @@ _deb_pkg_functional_tests:
grep -q 'apt install -y sosse' /tmp/code_blocks.json
sed -e 's#apt install -y sosse#apt install -y sudo; dpkg -i deb/*.deb ; /etc/init.d/postgresql start \& bash ./tests/wait_for_pg.sh#' -i /tmp/code_blocks.json
bash ./tests/doc_test.sh /tmp/code_blocks.json install/debian
sed -e 's/^.browser_options=.*/browser_options=--enable-precise-memory-info --disable-default-apps --incognito --headless --no-sandbox --disable-dev-shm-usage/' -i /etc/sosse/sosse.conf # add --no-sandbox --disable-dev-shm-usage to chromium's command line
sed -e 's/^.chromium_options=.*/chromium_options=--enable-precise-memory-info --disable-default-apps --incognito --headless --no-sandbox --disable-dev-shm-usage/' -i /etc/sosse/sosse.conf # add --no-sandbox --disable-dev-shm-usage to chromium's command line
sed -e 's/^.browser_crash_retry=.*/browser_crash_retry=3/' -i /etc/sosse/sosse.conf
sed -e 's/^.debug=.*/debug=true/' -i /etc/sosse/sosse.conf
sed -e 's/^.crawler_count=.*/crawler_count=1/' -i /etc/sosse/sosse.conf
/etc/init.d/nginx start
sed -e "s/^.default_browser=.*/default_browser=$(BROWSER)/" -i /etc/sosse/sosse.conf
bash -c 'uwsgi --uid www-data --gid www-data --plugin python3 --ini /etc/sosse/uwsgi.ini --logto /var/log/sosse/uwsgi.log & sudo -u www-data sosse-admin crawl &'
bash ./tests/docker_run.sh docker/pip-test/Dockerfile

Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ SOSSE 🦦
SOSSE (Selenium Open Source Search Engine) is a search engine and crawler written in Python, distributed under the [GNU-AGPLv3 license](https://www.gnu.org/licenses/agpl-3.0.en.html). It is hosted on both [Gitlab](https://gitlab.com/biolds1/sosse) and [Github](https://github.com/biolds/sosse); please use either of them to open feature requests, bug reports or merge requests, or [open a discussion](https://github.com/biolds/sosse/discussions).

SOSSE main features are:
- 🌍 Browser based crawling: the crawler can use [Google Chromium](https://www.chromium.org/Home) and [Selenium](https://www.selenium.dev/) to index pages that use Javascript. [Requests](https://docs.python-requests.org/en/latest/index.html) can also be used for faster crawling
- 🌍 Browser based crawling: the crawler can use [Mozilla Firefox](https://www.mozilla.org/firefox/) or [Google Chromium](https://www.chromium.org/Home), together with [Selenium](https://www.selenium.dev/), to index pages that use JavaScript. [Requests](https://docs.python-requests.org/en/latest/index.html) can also be used for faster crawling
- 🏖 Low resources requirements: SOSSE is entirely written in Python and uses [PostgreSQL](https://www.postgresql.org/) for data storage
- 🖼 Offline browsing: SOSSE can save HTML copy or take screenshots of crawled pages to create archives suitable for offline browsing
- 🔓 Authentication: the crawlers can submit authentication forms with provided credentials
Expand All @@ -38,7 +38,7 @@ You can try the latest version with Docker:
docker run -p 8005:80 biolds/sosse:latest
```

Connect to port 8005, and log in with user ``admin``, password ``admin``.
Open http://127.0.0.1:8005/, and log in with user ``admin``, password ``admin``.

To persist Docker data, or find alternative installation methods, please check the [documentation](https://sosse.readthedocs.io/en/stable/install.html).

Expand Down
2 changes: 1 addition & 1 deletion debian/control
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,6 @@ Homepage:

Package: sosse
Architecture: any
Depends: ${shlibs:Depends}, ${misc:Depends}, postgresql, nginx, uwsgi, uwsgi-plugin-python3, python3-cssutils, python3-django, python3-requests, python3-bs4, python3-html5lib, python3-psycopg2, python3-django-uwsgi, python3-feedparser, python3-langdetect, python3-pil, python3-publicsuffix2, python3-pygal, python3-lxml, python3-magic, python3-defusedxml, python3-selenium, libjs-jquery, chromium, chromium-driver, fonts-noto
Depends: ${shlibs:Depends}, ${misc:Depends}, postgresql, nginx, uwsgi, uwsgi-plugin-python3, python3-cssutils, python3-django, python3-requests, python3-bs4, python3-html5lib, python3-psycopg2, python3-django-uwsgi, python3-feedparser, python3-langdetect, python3-pil, python3-publicsuffix2, python3-pygal, python3-psutil, python3-lxml, python3-magic, python3-defusedxml, python3-selenium, libjs-jquery, firefox-esr, chromium, chromium-driver, fonts-noto, unifont
Description: Open Source Search Engine
Open Source Search Engine
4 changes: 2 additions & 2 deletions debian/sosse-crawler.service
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@ Description=SOSSE crawler
After=syslog.target network.target postgresql.service

[Service]
ExecStartPre=-+mkdir -p /run/sosse /var/log/sosse /var/lib/sosse/downloads /var/lib/sosse/screenshots /var/lib/sosse/html
ExecStartPre=-+mkdir -p /run/sosse /var/log/sosse /var/lib/sosse/downloads /var/lib/sosse/screenshots /var/lib/sosse/html /var/www/.mozilla /var/www/.cache
ExecStartPre=-+touch /var/log/sosse/crawler.log /var/log/sosse/debug.log /var/log/sosse/main.log /var/log/sosse/webserver.log
ExecStartPre=-+chown www-data:www-data /run/sosse /var/lib/sosse/downloads /var/lib/sosse/screenshots /var/lib/sosse/html /var/log/sosse/crawler.log /var/log/sosse/debug.log /var/log/sosse/main.log /var/log/sosse/webserver.log
ExecStartPre=-+chown -R www-data:www-data /run/sosse /var/lib/sosse/ /var/log/sosse/ /var/www/.mozilla /var/www/.cache
ExecStart=/usr/bin/sosse-admin crawl
User=www-data
Group=www-data
Expand Down
4 changes: 2 additions & 2 deletions debian/sosse.postinst
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,9 @@ case "$1" in
#dbc_generate_include_perms="0660"
#dbc_go webpy-example $@ || true

mkdir -p /run/sosse /var/log/sosse /var/lib/sosse/downloads /var/lib/sosse/screenshots /var/lib/sosse/html
mkdir -p /run/sosse /var/log/sosse /var/lib/sosse/downloads /var/lib/sosse/screenshots /var/lib/sosse/html /var/www/.mozilla /var/www/.cache
touch /var/log/sosse/crawler.log /var/log/sosse/debug.log /var/log/sosse/main.log /var/log/sosse/webserver.log
chown www-data:www-data /run/sosse /var/lib/sosse/downloads /var/lib/sosse/screenshots /var/lib/sosse/html /var/log/sosse/crawler.log /var/log/sosse/debug.log /var/log/sosse/main.log /var/log/sosse/webserver.log
chown www-data:www-data /run/sosse /var/lib/sosse/ /var/log/sosse /var/www/.mozilla /var/www/.cache -R

test -e /etc/sosse/sosse.conf || sosse-admin default_conf > /etc/sosse/sosse.conf
chmod 750 /etc/sosse/
Expand Down
4 changes: 2 additions & 2 deletions doc/source/authentication.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@ Authentication can be handled in various way described below:
Submitting forms
----------------

A lot of website are redirecting to a login page when accessing unauthorized content. SOSSE can detect this redirection, fill the login form and submit it before continuing to crawl. This method has the advantage of working on both Chromium and Python Requests, and can handle credential expiration. It can be defined in the :ref:`Crawl policy <authentication_params>`.
Many websites redirect to a login page when unauthorized content is accessed. SOSSE can detect this redirection, fill in the login form and submit it before continuing to crawl. This method has the advantage of working on both Chromium/Firefox and Python Requests, and can handle credential expiration. It can be defined in the :ref:`Crawl policy <authentication_params>`.

Executing javascript
--------------------

When crawling pages with Chromium, you can ran javascript code to handle any kind of authentication mechanism. See the ``Crawl policy`` :ref:`script parameter <script_params>`.
When crawling pages with Chromium or Firefox, you can run JavaScript code to handle any kind of authentication mechanism. See the ``Crawl policy`` :ref:`script parameter <script_params>`.

Cookie Edition
--------------
Expand Down
11 changes: 6 additions & 5 deletions doc/source/crawl/policies.rst
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,9 @@ Default browse mode

Can be one of:

* ``Detect``: the first time a domain is accessed, it is crawled with both Chromium and Python Requests. If the text content varies, it is assumed that the website is dynamic and Chromium will be used for subsequent crawling of pages in this domain. If the text content is the same, Python Request will be used since it is faster.
* ``Detect``: the first time a domain is accessed, it is crawled with a browser and Python Requests. If the text content varies, it is assumed that the website is dynamic and the browser will be used for subsequent crawling of pages in this domain. If the text content is the same, Python Requests will be used since it is faster. By default, the browser used is Chromium; this can be changed with the :ref:`default_browser option <conf_option_default_browser>`.
* ``Chromium``: Chromium is used.
* ``Firefox``: Firefox is used.
* ``Python Requests``: Python Requests is used.

.. _policy_create_thumbnails:
Expand All @@ -86,7 +87,7 @@ Create thumbnails
Make thumbnails of pages. These thumbnails are displayed in search results.

.. note::
This option requires the ``Default browse mode`` to be ``Chromium`` in order to work.
This option requires the ``Default browse mode`` to be ``Chromium`` or ``Firefox`` in order to work.

.. _policy_take_screenshot:

Expand All @@ -96,15 +97,15 @@ Take screenshots
Enables taking screenshots of pages for offline use. When the option :ref:`Create thumbnails <policy_create_thumbnails>` is disabled, the screenshot is displayed in search results instead.

.. note::
This option requires the ``Default browse mode`` to be ``Chromium`` in order to work.
This option requires the ``Default browse mode`` to be ``Chromium`` or ``Firefox`` in order to work.

Screenshot format
"""""""""""""""""

Format of the image: JPG or PNG.

.. note::
This option requires the ``Default browse mode`` to be ``Chromium`` in order to work.
This option requires the ``Default browse mode`` to be ``Chromium`` or ``Firefox`` in order to work.

Remove navigation related elements
""""""""""""""""""""""""""""""""""
Expand Down Expand Up @@ -133,7 +134,7 @@ For example, the following script could be used to click on a `GDPR <https://en.
In case the script triggers an error, further processing of the page is aborted and the error message is stored in the :ref:`document error field <document_error>`. It can be useful to use a tool such as `Tampermonkey <https://www.tampermonkey.net/>`_ to debug this kind of script.

.. note::
This option requires the ``Default browse mode`` to be ``Chromium`` in order to work.
This option requires the ``Default browse mode`` to be ``Chromium`` or ``Firefox`` in order to work.

.. _policy_html_snapshot:

Expand Down
2 changes: 1 addition & 1 deletion doc/source/domain_settings.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ Browse mode

When the policy's :ref:`Default browse mode <default_browse_params>` is set to ``Detect``, the ``Browse mode`` option of the
domain defines which browsing method to use. When its value is ``Detect``, the browsing mode is detected the next time the page
is accessed, and this option is switched to either ``Chromium`` or ``Python Requests``.
is accessed, and this option is switched to either ``Chromium``, ``Firefox`` or ``Python Requests``.

.. _domain_ignore_robots:

Expand Down
13 changes: 13 additions & 0 deletions doc/source/install/debian.rst
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,19 @@ After installing the package, the Nginx site needs to be enabled with:
ln -s /etc/nginx/sites-available/sosse.conf /etc/nginx/sites-enabled/
systemctl restart nginx
Geckodriver setup (optional)
----------------------------

To crawl pages with Firefox, it is required to install `Geckodriver <https://github.com/mozilla/geckodriver/>`_, with the command:

.. code-block:: shell
curl -L https://github.com/mozilla/geckodriver/releases/download/v0.33.0/geckodriver-v0.33.0-linux64.tar.gz | tar -C /usr/local/bin -x -v -z -f -
.. note::
A more recent Geckodriver may improve compatibility with the installed Firefox, though different versions have not been tested to work
correctly with SOSSE.

Next steps
----------

Expand Down
6 changes: 4 additions & 2 deletions doc/source/install/pip.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ Before installing SOSSE, you'll need to manually install the following softwares
- a web server supporting `WSGI <https://wsgi.readthedocs.io/en/latest/learn.html>`_ (the steps below explain how to set up `Nginx <https://nginx.org/>`_)
- a WSGI server (the steps below explain how to set up `uWSGI <https://uwsgi-docs.readthedocs.io/en/latest/>`_)
- `PostgreSQL <https://www.postgresql.org/>`_
- `Firefox <https://www.mozilla.org/firefox/>`_
- `Geckodriver <https://github.com/mozilla/geckodriver/>`_
- `Google Chromium <https://www.chromium.org/Home>`_
- `ChromeDriver <https://chromedriver.chromium.org/>`_

Expand All @@ -29,9 +31,9 @@ The default configuration and directories can be created with the commands:

.. code-block:: shell
mkdir -p /run/sosse /var/log/sosse /var/lib/sosse/downloads /var/lib/sosse/screenshots /var/lib/sosse/html
mkdir -p /run/sosse /var/log/sosse /var/www/.cache /var/www/.mozilla /var/lib/sosse/downloads /var/lib/sosse/screenshots /var/lib/sosse/html
touch /var/log/sosse/crawler.log /var/log/sosse/debug.log /var/log/sosse/main.log /var/log/sosse/webserver.log
chown -R www-data:www-data /run/sosse /var/lib/sosse /var/log/sosse
chown -R www-data:www-data /run/sosse /var/lib/sosse /var/www/.cache /var/www/.mozilla /var/log/sosse
mkdir /etc/sosse
/opt/sosse-venv/bin/sosse-admin default_conf > /etc/sosse/sosse.conf
Expand Down
Loading

0 comments on commit 093a284

Please sign in to comment.