diff --git a/docs/quickstart.rst b/docs/quickstart.rst index 11189c1e31..7cedf3d7ac 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -3,8 +3,8 @@ QuickStart ========== -Run a Scan (no installation required!) --------------------------------------- +Run a Local Directory Scan (no installation required!) +------------------------------------------------------ The **fastest way** to get started and **scan a codebase** — **no installation needed** — is by using the latest @@ -52,8 +52,120 @@ See the :ref:`RUN command ` section for more details on this command. .. note:: Not sure which pipeline to use? Check out :ref:`faq_which_pipeline`. -Next Step: Local Installation ------------------------------ +Run a Remote Package Scan +------------------------- + +Let's look at another example — this time scanning a **remote package archive** by +providing its **download URL**: + +.. code-block:: bash + + docker run --rm \ + ghcr.io/aboutcode-org/scancode.io:latest \ + run scan_single_package https://github.com/aboutcode-org/python-inspector/archive/refs/tags/v0.14.4.zip \ + > results.json + +Let's break down what's happening here: + +- ``docker run --rm`` + Runs a temporary container that is automatically removed after the scan completes. + +- ``ghcr.io/aboutcode-org/scancode.io:latest`` + Uses the latest ScanCode.io image from GitHub Container Registry. + +- ``run scan_single_package <URL>`` + Executes the ``scan_single_package`` pipeline, automatically fetching and analyzing + the package archive from the provided URL. + +- ``> results.json`` + Writes the scan results to a local ``results.json`` file. + +Notice that the ``-v "$(pwd)":/codedrop`` option is **not required** in this case +because the input is downloaded directly from the provided URL, rather than coming +from your local filesystem. + +The result? 
A **complete scan of a remote package archive — no setup, one command!** + +Use PostgreSQL for Better Performance +------------------------------------- + +By default, ScanCode.io uses a **temporary SQLite database** for simplicity. +While this works well for quick scans, it has a few limitations — such as +**no multiprocessing** and slower performance on large codebases. + +For improved speed and scalability, you can run your pipelines using a +**PostgreSQL database** instead. + +Start a PostgreSQL Database Service +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +First, start a PostgreSQL container in the background: + +.. code-block:: bash + + docker run -d \ + --name scancodeio-run-db \ + -e POSTGRES_DB=scancodeio \ + -e POSTGRES_USER=scancodeio \ + -e POSTGRES_PASSWORD=scancodeio \ + -e POSTGRES_INITDB_ARGS="--encoding=UTF-8 --lc-collate=en_US.UTF-8 --lc-ctype=en_US.UTF-8" \ + -v scancodeio_pgdata:/var/lib/postgresql/data \ + -p 5432:5432 \ + postgres:17 + +This command starts a new PostgreSQL service named ``scancodeio-run-db`` and stores its +data in a named Docker volume called ``scancodeio_pgdata``. + +.. note:: + You can stop and remove the PostgreSQL service once you are done using: + + .. code-block:: bash + + docker rm -f scancodeio-run-db + +.. tip:: + The named volume ``scancodeio_pgdata`` ensures that your database data + **persists across runs**. + You can remove it later with ``docker volume rm scancodeio_pgdata`` if needed. + +Run a Docker Image Analysis Using PostgreSQL +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Once PostgreSQL is running, you can start a ScanCode.io pipeline +using the same Docker image, connecting it to the PostgreSQL database container: + +.. 
code-block:: bash + + docker run --rm \ + --network host \ + -e SCANCODEIO_NO_AUTO_DB=1 \ + ghcr.io/aboutcode-org/scancode.io:latest \ + run analyze_docker_image docker://alpine:3.22.1 \ + > results.json + +Here’s what’s happening: + +- ``--network host`` + Ensures the container can connect to the PostgreSQL service running on your host. + +- ``-e SCANCODEIO_NO_AUTO_DB=1`` + Tells ScanCode.io **not** to create a temporary SQLite database, and instead use + the configured PostgreSQL connection defined in its default settings. + +- ``ghcr.io/aboutcode-org/scancode.io:latest`` + Uses the latest ScanCode.io image from GitHub Container Registry. + +- ``run analyze_docker_image docker://alpine:3.22.1`` + Runs the ``analyze_docker_image`` pipeline, scanning the given Docker image. + +- ``> results.json`` + Saves the scan results to a local ``results.json`` file. + +The result? A **faster, multiprocessing-enabled scan** backed by PostgreSQL — ideal +for large or complex analyses. + +Next Step: Installation +----------------------- Install ScanCode.io, to **unlock all features**: diff --git a/scancodeio/__init__.py b/scancodeio/__init__.py index 75ce16f35c..c689f4c02e 100644 --- a/scancodeio/__init__.py +++ b/scancodeio/__init__.py @@ -106,6 +106,9 @@ def combined_run(): configuration. It combines the creation, execution, and result retrieval of the project into a single process. + + Set SCANCODEIO_NO_AUTO_DB=1 to use the database configuration from the settings + instead of SQLite. 
""" from django.core.checks.security.base import SECRET_KEY_INSECURE_PREFIX from django.core.management import execute_from_command_line @@ -114,10 +117,12 @@ def combined_run(): os.environ.setdefault("DJANGO_SETTINGS_MODULE", "scancodeio.settings") secret_key = SECRET_KEY_INSECURE_PREFIX + get_random_secret_key() os.environ.setdefault("SECRET_KEY", secret_key) - os.environ.setdefault("SCANCODEIO_DB_ENGINE", "django.db.backends.sqlite3") - os.environ.setdefault("SCANCODEIO_DB_NAME", "scancodeio.sqlite3") - # Disable multiprocessing - os.environ.setdefault("SCANCODEIO_PROCESSES", "0") + + # Default to SQLite unless SCANCODEIO_NO_AUTO_DB is provided + if not os.getenv("SCANCODEIO_NO_AUTO_DB"): + os.environ.setdefault("SCANCODEIO_DB_ENGINE", "django.db.backends.sqlite3") + os.environ.setdefault("SCANCODEIO_DB_NAME", "scancodeio.sqlite3") + os.environ.setdefault("SCANCODEIO_PROCESSES", "0") # Disable multiprocessing sys.argv.insert(1, "run") execute_from_command_line(sys.argv) diff --git a/scanpipe/management/commands/__init__.py b/scanpipe/management/commands/__init__.py index 57b74e1394..ade0a88bb2 100644 --- a/scanpipe/management/commands/__init__.py +++ b/scanpipe/management/commands/__init__.py @@ -284,20 +284,23 @@ def validate_pipelines(pipelines_data): return pipelines_data -def extract_tag_from_input_files(input_files): +def extract_tag_from_input_file(file_location): """ - Add support for the ":tag" suffix in file location. + Parse a file location with optional tag suffix. 
For example: "/path/to/file.zip:tag" """ - input_files_data = {} - for file in input_files: - if ":" in file: - key, value = file.split(":", maxsplit=1) - input_files_data.update({key: value}) - else: - input_files_data.update({file: ""}) - return input_files_data + if ":" in file_location: + cleaned_location, tag = file_location.split(":", maxsplit=1) + return cleaned_location, tag + return file_location, "" + + +def extract_tag_from_input_files(input_files): + """Parse multiple file locations with optional tag suffixes.""" + return dict( + extract_tag_from_input_file(file_location) for file_location in input_files + ) def validate_input_files(input_files): diff --git a/scanpipe/management/commands/run.py b/scanpipe/management/commands/run.py index 83b5b49082..287902dd7c 100644 --- a/scanpipe/management/commands/run.py +++ b/scanpipe/management/commands/run.py @@ -20,6 +20,7 @@ # ScanCode.io is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/aboutcode-org/scancode.io for support and download. +from collections import defaultdict from pathlib import Path from django.core.management import call_command @@ -27,6 +28,7 @@ from django.core.management.base import CommandError from django.utils.crypto import get_random_string +from scanpipe.management.commands import extract_tag_from_input_file from scanpipe.pipes.fetch import SCHEME_TO_FETCHER_MAPPING @@ -42,12 +44,16 @@ def add_arguments(self, parser): help=( "One or more pipeline to run. " "The pipelines executed based on their given order. " - 'Groups can be provided using the "pipeline_name:option1,option2"' - " syntax." + 'Groups can be provided using the "pipeline_name:option1,option2" ' + "syntax." ), ) parser.add_argument( - "input_location", help="Input location: file, directory, and URL supported." + "input_location", + help=( + "Input location: file, directory, and URL supported. " + 'Multiple values can be provided using the "input1,input2" syntax.' 
+ ), ) parser.add_argument("--project", required=False, help="Project name.") parser.add_argument( @@ -68,22 +74,40 @@ def handle(self, *args, **options): "pipeline": pipelines, "execute": True, "verbosity": 0, + **self.get_input_options(input_location), } - if input_location.startswith(tuple(SCHEME_TO_FETCHER_MAPPING.keys())): - create_project_options["input_urls"] = [input_location] - else: - input_path = Path(input_location) - if not input_path.exists(): - raise CommandError(f"{input_location} not found.") - if input_path.is_file(): - create_project_options["input_files"] = [input_location] - else: - create_project_options["copy_codebase"] = input_location - # Run the database migrations in case the database is not created or outdated. call_command("migrate", verbosity=0, interactive=False) # Create a project with proper inputs and execute the pipeline(s) call_command("create-project", project_name, **create_project_options) # Print the results for the specified format on stdout call_command("output", project=project_name, format=[output_format], print=True) + + @staticmethod + def get_input_options(input_location): + """ + Parse a comma-separated list of input locations and convert them into options + for the `create-project` command. + """ + input_options = defaultdict(list) + + for location in input_location.split(","): + if location.startswith(tuple(SCHEME_TO_FETCHER_MAPPING.keys())): + input_options["input_urls"].append(location) + + else: + cleaned_location, _ = extract_tag_from_input_file(location) + input_path = Path(cleaned_location) + if not input_path.exists(): + raise CommandError(f"{location} not found.") + if input_path.is_file(): + input_options["input_files"].append(location) + else: + if input_options["copy_codebase"]: + raise CommandError( + "Only one codebase directory can be provided as input." 
+ ) + input_options["copy_codebase"] = location + + return input_options diff --git a/scanpipe/tests/test_commands.py b/scanpipe/tests/test_commands.py index e2e9e09b70..5c74d56954 100644 --- a/scanpipe/tests/test_commands.py +++ b/scanpipe/tests/test_commands.py @@ -984,6 +984,53 @@ def test_scanpipe_management_command_run(self): self.assertEqual("do_nothing", runs[1]["pipeline_name"]) self.assertEqual(["Group1", "Group2"], runs[1]["selected_groups"]) + @mock.patch("requests.sessions.Session.get") + def test_scanpipe_management_command_run_multiple_inputs(self, mock_get): + source_download_url = "https://example.com/z-source.zip#from" + bin_download_url = "https://example.com/z-bin.zip#to" + mock_get.side_effect = [ + make_mock_response(url=source_download_url), + make_mock_response(url=bin_download_url), + ] + + out = StringIO() + inputs = [ + # copy_codebase option + str(self.data / "codebase"), + # input_files option + str(self.data / "d2d" / "jars" / "from-flume-ng-node-1.9.0.zip"), + str(self.data / "d2d" / "jars" / "to-flume-ng-node-1.9.0.zip"), + # input_urls option + source_download_url, + bin_download_url, + ] + joined_locations = ",".join(inputs) + with redirect_stdout(out): + call_command("run", "download_inputs", joined_locations) + + json_data = json.loads(out.getvalue()) + headers = json_data["headers"] + project_uuid = headers[0]["uuid"] + project = Project.objects.get(uuid=project_uuid) + + expected = [ + "from-flume-ng-node-1.9.0.zip", + "to-flume-ng-node-1.9.0.zip", + "z-bin.zip", + "z-source.zip", + ] + self.assertEqual(expected, sorted(project.input_files)) + + input_sources = headers[0]["input_sources"] + self.assertEqual("z-bin.zip", input_sources[2]["filename"]) + self.assertEqual("to", input_sources[2]["tag"]) + self.assertEqual("z-source.zip", input_sources[3]["filename"]) + self.assertEqual("from", input_sources[3]["tag"]) + + codebase_files = [path.name for path in project.codebase_path.glob("*")] + expected = ["a.txt", "b.txt", "c.txt"] 
+ self.assertEqual(expected, sorted(codebase_files)) + @mock.patch("scanpipe.models.Project.get_latest_output") @mock.patch("requests.post") @mock.patch("requests.sessions.Session.get") @@ -1414,6 +1461,19 @@ def test_scanpipe_management_command_verify_project(self): stdout=out, ) + def test_scanpipe_management_command_extract_tag_from_input_file(self): + extract_tag = commands.extract_tag_from_input_file + expected = ("file.ext", "") + self.assertEqual(expected, extract_tag("file.ext")) + expected = ("file.ext", "") + self.assertEqual(expected, extract_tag("file.ext:")) + expected = ("file.ext", "tag") + self.assertEqual(expected, extract_tag("file.ext:tag")) + expected = ("file.ext", "tag1:tag2") + self.assertEqual(expected, extract_tag("file.ext:tag1:tag2")) + expected = ("file.ext", "tag1,tag2") + self.assertEqual(expected, extract_tag("file.ext:tag1,tag2")) + class ScanPipeManagementCommandMixinTest(TestCase): class CreateProjectCommand(