diff --git a/docs/development/commonconfig.rst b/docs/development/commonconfig.rst
new file mode 100644
index 0000000000..e5a6dcdc43
--- /dev/null
+++ b/docs/development/commonconfig.rst
@@ -0,0 +1,132 @@
+.. index:: commonconfig
+
+.. _commonconfig-chapter:
+
+
+Common Config
+=============
+
+To avoid repetition between the configurations of a half dozen
+independently running applications, common settings are consolidated
+in a common configuration file:
+``.../scripts/config/commonconfig.py.dist``.
+
+All Socorro applications have these constants available to them. For
+Socorro applications that are command line driven, each of these
+default values can be overridden by a command line switch of the same
+name.
+
+To set up this configuration file, copy the example,
+``.../scripts/config/commonconfig.py.dist``, to
+``.../scripts/config/commonconfig.py``.
+
+Edit the file for your local situation::
+
+    import socorro.lib.ConfigurationManager as cm
+    import datetime
+    import stat
+
+    #---------------------------------------------------------------------------
+    # Relational Database Section
+
+    databaseHost = cm.Option()
+    databaseHost.doc = 'the hostname of the database servers'
+    databaseHost.default = 'localhost'
+
+    databasePort = cm.Option()
+    databasePort.doc = 'the port of the database on the host'
+    databasePort.default = 5432
+
+    databaseName = cm.Option()
+    databaseName.doc = 'the name of the database within the server'
+    databaseName.default = ''
+
+    databaseUserName = cm.Option()
+    databaseUserName.doc = 'the user name for the database servers'
+    databaseUserName.default = ''
+
+    databasePassword = cm.Option()
+    databasePassword.doc = 'the password for the database user'
+    databasePassword.default = ''
+
+    #---------------------------------------------------------------------------
+    # Crash storage system
+
+    jsonFileSuffix = cm.Option()
+    jsonFileSuffix.doc = 'the suffix used to identify a json file'
+    jsonFileSuffix.default = '.json'
+
+    dumpFileSuffix = cm.Option()
+    dumpFileSuffix.doc = 'the suffix used to identify a dump file'
+    dumpFileSuffix.default = '.dump'
+
+    #---------------------------------------------------------------------------
+    # HBase storage system
+
+    hbaseHost = cm.Option()
+    hbaseHost.doc = 'Hostname for hbase hadoop cluster.
May be a VIP or load balancer' + hbaseHost.default = 'localhost' + + hbasePort = cm.Option() + hbasePort.doc = 'hbase port number' + hbasePort.default = 9090 + + hbaseTimeout = cm.Option() + hbaseTimeout.doc = 'timeout in milliseconds for an HBase connection' + hbaseTimeout.default = 5000 + + #--------------------------------------------------------------------------- + # misc + + processorCheckInTime = cm.Option() + processorCheckInTime.doc = 'the time after which a processor is considered dead (hh:mm:ss)' + processorCheckInTime.default = "00:05:00" + processorCheckInTime.fromStringConverter = lambda x: str(cm.timeDeltaConverter(x)) + + startWindow = cm.Option() + startWindow.doc = 'The start of the single aggregation window (YYYY-MM-DD [hh:mm:ss])' + startWindow.fromStringConverter = cm.dateTimeConverter + + deltaWindow = cm.Option() + deltaWindow.doc = 'The length of the single aggregation window ([dd:]hh:mm:ss)' + deltaWindow.fromStringConverter = cm.timeDeltaConverter + + defaultDeltaWindow = cm.Option() + defaultDeltaWindow.doc = 'The length of the single aggregation window ([dd:]hh:mm:ss)' + defaultDeltaWindow.fromStringConverter = cm.timeDeltaConverter + + # override this default for your particular cron task + defaultDeltaWindow.default = '00:12:00' + + endWindow = cm.Option() + endWindow.doc = 'The end of the single aggregation window (YYYY-MM-DD [hh:mm:ss])' + endWindow.fromStringConverter = cm.dateTimeConverter + + startDate = cm.Option() + startDate.doc = 'The start of the overall/outer aggregation window (YYYY-MM-DD [hh:mm])' + startDate.fromStringConverter = cm.dateTimeConverter + + deltaDate = cm.Option() + deltaDate.doc = 'The length of the overall/outer aggregation window ([dd:]hh:mm:ss)' + deltaDate.fromStringConverter = cm.timeDeltaConverter + + initialDeltaDate = cm.Option() + initialDeltaDate.doc = 'The length of the overall/outer aggregation window ([dd:]hh:mm:ss)' + initialDeltaDate.fromStringConverter = cm.timeDeltaConverter + + # override this default for your particular cron task + initialDeltaDate.default = '4:00:00:00' + + minutesPerSlot = cm.Option() + minutesPerSlot.doc = 'how many minutes per leaf directory in the date storage branch' + minutesPerSlot.default = 1 + + endDate = cm.Option() + endDate.doc = 'The end of the overall/outer aggregation window (YYYY-MM-DD [hh:mm:ss])' + endDate.fromStringConverter = cm.dateTimeConverter + + debug = cm.Option() + debug.doc = 'do debug output and routines' + debug.default = False + debug.singleCharacter = 'D' + debug.fromStringConverter = cm.booleanConverter diff --git a/docs/development/contributing.rst b/docs/development/contributing.rst index 69ff803abc..48c504f56e 100644 --- a/docs/development/contributing.rst +++ b/docs/development/contributing.rst @@ -19,5 +19,6 @@ Contributing fs database package + commonconfig python-dependencies addaservice diff --git a/docs/development/generalarchitecture.rst b/docs/development/generalarchitecture.rst index e5e8a79a98..f82b231455 100644 --- a/docs/development/generalarchitecture.rst +++ b/docs/development/generalarchitecture.rst @@ -58,6 +58,8 @@ Here are descriptions of every submodule in there: +-------------------+---------------------------------------------------------------+ | database | PostgreSQL related code. | +-------------------+---------------------------------------------------------------+ +| deferredcleanup | Osolete. 
|
++-------------------+---------------------------------------------------------------+
 | external          | Here are APIs related to external resources like databases.  |
 +-------------------+---------------------------------------------------------------+
 | integrationtest   | Osolete.                                                      |
diff --git a/docs/development/glossary/collector.rst b/docs/development/glossary/collector.rst
index b52e6b71cc..46df6619c8 100644
--- a/docs/development/glossary/collector.rst
+++ b/docs/development/glossary/collector.rst
@@ -23,3 +23,33 @@ system.
 
 After a crash is saved, there is an app called :ref:`crashmover-chapter`
 that will transfer the crashes to HBase.
+
+Collector Python Configuration
+------------------------------
+
+Like all the Socorro applications, the configuration is actually
+executable Python code. Two configuration files are relevant for the
+collector:
+
+* Copy ``.../scripts/config/commonconfig.py.dist`` to
+  ``.../scripts/config/commonconfig.py``. This configuration file
+  contains constants used by many of the Socorro applications.
+* Copy ``.../scripts/config/collectorconfig.py.dist`` to
+  ``.../scripts/config/collectorconfig.py``.
+
+Common Configuration
+--------------------
+
+There are two constants in ``.../scripts/config/commonconfig.py`` of
+interest to the collector: ``jsonFileSuffix`` and ``dumpFileSuffix``. Other
+constants in this file are ignored.
+
+To set up the common configuration, see :ref:`commonconfig-chapter`.
+
+Collector Configuration
+-----------------------
+
+``collectorconfig.py`` has several options that adjust how files are stored:
+
+`See sample config code on Github
+`_
diff --git a/docs/development/glossary/crashmover.rst b/docs/development/glossary/crashmover.rst
index 9c2b554ac9..29f806aa09 100644
--- a/docs/development/glossary/crashmover.rst
+++ b/docs/development/glossary/crashmover.rst
@@ -7,4 +7,81 @@ Crash Mover
 The :ref:`collector-chapter` dumps all the crashes that it receives into the
 local file system. This application is responsible for transferring
-those crashes into primary storage, Amazon S3.
+those crashes into HBase.
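
Conceptually, the mover instantiates the source storage (the collector's
local file system) and the destination storage (HBase) named by the
configuration below, then loops over newly collected crashes and copies each
JSON/dump pair across. The following is only an illustrative sketch of that
flow: the storage objects are supplied by the caller, and the method names
used on them (``new_uuids``, ``get_meta``, ``get_raw_dump``, ``save_raw``,
``remove``) are assumed stand-ins, not the actual
``socorro.storage.crashstorage`` interface::

    def move_new_crashes(source, destination):
        """Illustrative sketch of the crash mover's main loop.

        source/destination stand in for instances of the configured
        sourceStorageClass and destinationStorageClass; the method names
        below are assumptions for illustration only.
        """
        moved = 0
        for uuid in source.new_uuids():              # crashes the collector saved locally
            meta = source.get_meta(uuid)             # the .json half of the pair
            dump = source.get_raw_dump(uuid)         # the .dump half of the pair
            destination.save_raw(uuid, meta, dump)   # write the pair into HBase
            source.remove(uuid)                      # reclaim local file system space
            moved += 1
        return moved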
+ +**Configuration**:: + + import stat + import socorro.lib.ConfigurationManager as cm + + #------------------------------------------------------------------------------- + # general + + numberOfThreads = cm.Option() + numberOfThreads.doc = 'the number of threads to use' + numberOfThreads.default = 4 + + #------------------------------------------------------------------------------- + # source storage + + sourceStorageClass = cm.Option() + sourceStorageClass.doc = 'the fully qualified name of the source storage class' + sourceStorageClass.default = 'socorro.storage.crashstorage.CrashStorageSystemForLocalFS' + sourceStorageClass.fromStringConverter = cm.classConverter + + from config.collectorconfig import localFS + from config.collectorconfig import localFSDumpDirCount + from config.collectorconfig import localFSDumpGID + from config.collectorconfig import localFSDumpPermissions + from config.collectorconfig import localFSDirPermissions + from config.collectorconfig import fallbackFS + from config.collectorconfig import fallbackDumpDirCount + from config.collectorconfig import fallbackDumpGID + from config.collectorconfig import fallbackDumpPermissions + from config.collectorconfig import fallbackDirPermissions + + from config.commonconfig import jsonFileSuffix + from config.commonconfig import dumpFileSuffix + + #------------------------------------------------------------------------------- + # destination storage + + destinationStorageClass = cm.Option() + destinationStorageClass.doc = 'the fully qualified name of the source storage class' + destinationStorageClass.default = 'socorro.storage.crashstorage.CrashStorageSystemForHBase' + destinationStorageClass.fromStringConverter = cm.classConverter + + from config.commonconfig import hbaseHost + from config.commonconfig import hbasePort + from config.commonconfig import hbaseTimeout + + #------------------------------------------------------------------------------- + # logging + + syslogHost = cm.Option() + syslogHost.doc = 'syslog hostname' + syslogHost.default = 'localhost' + + syslogPort = cm.Option() + syslogPort.doc = 'syslog port' + syslogPort.default = 514 + + syslogFacilityString = cm.Option() + syslogFacilityString.doc = 'syslog facility string ("user", "local0", etc)' + syslogFacilityString.default = 'user' + + syslogLineFormatString = cm.Option() + syslogLineFormatString.doc = 'python logging system format for syslog entries' + syslogLineFormatString.default = 'Socorro Storage Mover (pid %(process)d): %(asctime)s %(levelname)s - %(threadName)s - %(message)s' + + syslogErrorLoggingLevel = cm.Option() + syslogErrorLoggingLevel.doc = 'logging level for the log file (10 - DEBUG, 20 - INFO, 30 - WARNING, 40 - ERROR, 50 - CRITICAL)' + syslogErrorLoggingLevel.default = 10 + + stderrLineFormatString = cm.Option() + stderrLineFormatString.doc = 'python logging system format for logging to stderr' + stderrLineFormatString.default = '%(asctime)s %(levelname)s - %(threadName)s - %(message)s' + + stderrErrorLoggingLevel = cm.Option() + stderrErrorLoggingLevel.doc = 'logging level for the logging to stderr (10 - DEBUG, 20 - INFO, 30 - WARNING, 40 - ERROR, 50 - CRITICAL)' + stderrErrorLoggingLevel.default = 10 diff --git a/docs/development/glossary/deferredcleanup.rst b/docs/development/glossary/deferredcleanup.rst new file mode 100644 index 0000000000..218325af43 --- /dev/null +++ b/docs/development/glossary/deferredcleanup.rst @@ -0,0 +1,116 @@ +.. index:: deferredcleanup + +.. 
_deferredcleanup-chapter:
+
+
+Deferred Cleanup
+================
+
+When the :ref:`collector-chapter` throttles the flow of crash dumps, it saves
+deferred crashes into :ref:`deferredjobstorage-chapter`. These JSON/dump pairs will
+live in deferred storage for a configurable number of days. It is the
+task of the deferred cleanup application to implement the policy to
+delete old crash dumps.
+
+The deferred cleanup application is a command line app meant to be run
+as a cron job. It should be set to run once every twenty-four hours.
+
+Configuration
+-------------
+
+deferredcleanup uses the common configuration to get the constant
+``deferredStorageRoot``. For setup of the common configuration, see
+:ref:`commonconfig-chapter`.
+
+deferredcleanup also has an executable configuration file of its own.
+A sample file is found at
+``.../scripts/config/deferredcleanupconfig.py.dist``. Copy this file to
+``.../scripts/config/deferredcleanupconfig.py`` and edit it for site
+specific settings.
+
+In each case where a site specific value is desired, replace the value
+of the ``.default`` member.
+
+**maximumDeferredJobAge**
+
+This constant specifies how many days deferred jobs are allowed to
+stay in deferred storage. Job deletion is permanent::
+
+    maximumDeferredJobAge = cm.Option()
+    maximumDeferredJobAge.doc = 'the maximum number of days that deferred jobs stick around'
+    maximumDeferredJobAge.default = 2
+
+**dryRun**
+
+Used during testing and development, this prevents deferredcleanup
+from actually deleting things::
+
+    dryRun = cm.Option()
+    dryRun.doc = "don't really delete anything"
+    dryRun.default = False
+    dryRun.fromStringConverter = cm.booleanConverter
+
+**logFilePathname**
+
+Deferredcleanup can log its actions to a set of automatically rotating
+log files. This is the name and location of the logs::
+
+    logFilePathname = cm.Option()
+    logFilePathname.doc = 'full pathname for the log file'
+    logFilePathname.default = './processor.log'
+
+**logFileMaximumSize**
+
+This is the maximum size in bytes allowed for a log file. Once this
+number is reached, the logs rotate and a new log is started::
+
+    logFileMaximumSize = cm.Option()
+    logFileMaximumSize.doc = 'maximum size in bytes of the log file'
+    logFileMaximumSize.default = 1000000
+
+**logFileMaximumBackupHistory**
+
+The maximum number of log files to keep::
+
+    logFileMaximumBackupHistory = cm.Option()
+    logFileMaximumBackupHistory.doc = 'maximum number of log files to keep'
+    logFileMaximumBackupHistory.default = 50
+
+**logFileLineFormatString**
+
+A Python format string that controls the format of individual lines in
+the logs::
+
+    logFileLineFormatString = cm.Option()
+    logFileLineFormatString.doc = 'python logging system format for log file entries'
+    logFileLineFormatString.default = '%(asctime)s %(levelname)s - %(message)s'
+
+**logFileErrorLoggingLevel**
+
+Logging is done in severity levels - the lower the number, the more
+verbose the logs::
+
+    logFileErrorLoggingLevel = cm.Option()
+    logFileErrorLoggingLevel.doc = 'logging level for the log file (10 - DEBUG, 20 - INFO, 30 - WARNING, 40 - ERROR, 50 - CRITICAL)'
+    logFileErrorLoggingLevel.default = 20
+
+**stderrLineFormatString**
+
+In parallel with creating log files, deferredcleanup can log to stderr.
+This is a Python format string that controls the format of individual
+lines sent to stderr::
+
+    stderrLineFormatString = cm.Option()
+    stderrLineFormatString.doc = 'python logging system format for logging to stderr'
+    stderrLineFormatString.default = '%(asctime)s %(levelname)s - %(message)s'
+
+**stderrErrorLoggingLevel**
+
+Logging to stderr is done in severity levels independently from the
+log file severity levels - the lower the number, the more verbose the
+output to stderr::
+
+    stderrErrorLoggingLevel = cm.Option()
+    stderrErrorLoggingLevel.doc = 'logging level for the logging to stderr (10 - DEBUG, 20 - INFO, 30 - WARNING, 40 - ERROR, 50 - CRITICAL)'
+    stderrErrorLoggingLevel.default = 40
diff --git a/docs/development/glossary/deferredjobstorage.rst b/docs/development/glossary/deferredjobstorage.rst
new file mode 100644
index 0000000000..058d02e0fd
--- /dev/null
+++ b/docs/development/glossary/deferredjobstorage.rst
@@ -0,0 +1,22 @@
+.. index:: deferredjobstorage
+
+.. _deferredjobstorage-chapter:
+
+
+Deferred Job Storage
+====================
+
+Deferred storage is where the JSON/dump pairs are saved if they've
+been filtered out by :ref:`collector-chapter` throttling. The location of the
+deferred job storage is determined by the configuration parameter
+``deferredStorageRoot`` found in the :ref:`commonconfig-chapter`.
+
+JSON/dump pairs that are saved in deferred storage are not likely to
+ever be processed further. They are held for a configurable number of
+days until deleted by :ref:`deferredcleanup-chapter`.
+
+Occasionally, a developer will request a report via :ref:`reporter-chapter` on
+a job that was saved in deferred storage. :ref:`monitor-chapter` will look for
+the job in deferred storage if it cannot find it in standard storage.
+
+For more information on the storage technique, see :ref:`filesystem-chapter`.
diff --git a/docs/development/glossary/monitor.rst b/docs/development/glossary/monitor.rst
new file mode 100644
index 0000000000..23d6299192
--- /dev/null
+++ b/docs/development/glossary/monitor.rst
@@ -0,0 +1,171 @@
+.. index:: monitor
+
+.. _monitor-chapter:
+
+
+Monitor
+=======
+
+Monitor is a multithreaded application with several mandates. Its
+main job is to find new JSON/dump pairs and queue them for further
+processing. It looks for new JSON/dump pairs in the file system
+location designated by the constant ``storageRoot`` from the
+:ref:`commonconfig-chapter` file. Once it finds a pair, it queues it as a
+"job" in the database 'jobs' table and assigns it to a specific
+processor. The monitor then goes on to find other new jobs to queue.
+
+Monitor also locates and queues priority jobs. If a user requests a
+report via the :ref:`reporter-chapter` and that crash report has not yet been
+processed, the :ref:`reporter-chapter` puts the requested crash's UUID into
+the database's 'priorityjobs' table. Monitor looks in three places for
+the requested job:
+
+* the processors - if monitor finds the job already assigned to a
+  processor, it raises the priority of that job so the processor will
+  do it quickly
+* the storageRoot file system - if the job is found here, it queues it
+  for priority processing immediately rather than waiting for the
+  standard mechanism to eventually find it
+* the deferredStorageRoot file system - if the requested crash was
+  filtered out by server side throttling, monitor will find it and
+  queue it immediately from that location (see the sketch after this
+  list).
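
The three-step lookup can be pictured as a small routine. This is only a
sketch of the policy described above, assuming hypothetical ``processors``,
``standard_storage`` and ``deferred_storage`` objects; the method names used
here are illustrative and not the real monitor API::

    def locate_priority_job(uuid, processors, standard_storage, deferred_storage):
        """Sketch of monitor's priority lookup; all helper objects are hypothetical."""
        processor_id = processors.find_assigned_job(uuid)   # query against the 'jobs' table
        if processor_id is not None:
            processors.raise_priority(uuid)                 # already queued: just bump it
            return 'processors'
        if standard_storage.contains(uuid):                 # storageRoot branch
            standard_storage.queue_for_priority_processing(uuid)
            return 'standard storage'
        if deferred_storage.contains(uuid):                 # deferredStorageRoot branch
            deferred_storage.queue_for_priority_processing(uuid)
            return 'deferred storage'
        return None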
+
+Monitor is also responsible for keeping the StandardJobStorage file
+system neat and tidy. It monitors the 'jobs' queue in the database.
+Once it sees that a previously queued job has been completed, it moves
+the JSON/dump pairs to long term storage or it deletes them (based on
+a configuration setting). Jobs that fail their further processing
+stage are also either saved in a "failed" storage area or deleted.
+
+Monitor is a command line application meant to be run continuously as
+a daemon. It can log its actions to stderr and/or to automatically
+rotating log files. See the configuration options below beginning with
+``stderr*`` and ``logFile*`` for more information.
+
+The monitor app is found at ``.../scripts/monitor.py``. In order to run
+monitor, the socorro package must be visible somewhere on the python
+path.
+
+Configuration
+-------------
+
+Monitor, like all the Socorro applications, uses the common
+configuration for several of its constants. For setup of the common
+configuration, see :ref:`commonconfig-chapter`.
+
+monitor also has an executable configuration file of its own. A sample
+file is found at ``.../scripts/config/monitorconfig.py.dist``. Copy this
+file to ``.../scripts/config/monitorconfig.py`` and edit it for site
+specific settings.
+
+In each case where a site specific value is desired, replace the value
+of the ``.default`` member.
+
+**standardLoopDelay**
+
+Monitor has to scan the StandardJobStorage looking for jobs. This
+value represents the delay between scans::
+
+    standardLoopDelay = cm.Option()
+    standardLoopDelay.doc = 'the time between scans for jobs (HHH:MM:SS)'
+    standardLoopDelay.default = '00:05:00'
+    standardLoopDelay.fromStringConverter = cm.timeDeltaConverter
+
+**cleanupJobsLoopDelay**
+
+Monitor archives or deletes JSON/dump pairs from the
+StandardJobStorage. This value represents the delay between runs of the
+archive/delete routines::
+
+    cleanupJobsLoopDelay = cm.Option()
+    cleanupJobsLoopDelay.doc = 'the time between runs of the job clean up routines (HHH:MM:SS)'
+    cleanupJobsLoopDelay.default = '00:05:00'
+    cleanupJobsLoopDelay.fromStringConverter = cm.timeDeltaConverter
+
+**priorityLoopDelay**
+
+The frequency with which to look for priority jobs::
+
+    priorityLoopDelay = cm.Option()
+    priorityLoopDelay.doc = 'the time between checks for priority jobs (HHH:MM:SS)'
+    priorityLoopDelay.default = '00:01:00'
+    priorityLoopDelay.fromStringConverter = cm.timeDeltaConverter
+
+**saveSuccessfulMinidumpsTo**::
+
+    saveSuccessfulMinidumpsTo = cm.Option()
+    saveSuccessfulMinidumpsTo.doc = 'the location for saving successfully processed dumps (leave blank to delete them instead)'
+    saveSuccessfulMinidumpsTo.default = '/tmp/socorro-sucessful'
+
+**saveFailedMinidumpsTo**::
+
+    saveFailedMinidumpsTo = cm.Option()
+    saveFailedMinidumpsTo.doc = 'the location for saving dumps that failed processing (leave blank to delete them instead)'
+    saveFailedMinidumpsTo.default = '/tmp/socorro-failed'
+
+**logFilePathname**
+
+Monitor can log its actions to a set of automatically rotating log
+files. This is the name and location of the logs::
+
+    logFilePathname = cm.Option()
+    logFilePathname.doc = 'full pathname for the log file'
+    logFilePathname.default = './monitor.log'
+
+**logFileMaximumSize**
+
+This is the maximum size in bytes allowed for a log file.
Once this +number is achieved, the logs rotate and a new log is started.:: + + logFileMaximumSize = cm.Option() + logFileMaximumSize.doc = 'maximum size in bytes of the log file' + logFileMaximumSize.default = 1000000 + +**logFileMaximumBackupHistory** + +The maximum number of log files to keep.:: + + logFileMaximumBackupHistory = cm.Option() + logFileMaximumBackupHistory.doc = 'maximum number of log files to keep' + logFileMaximumBackupHistory.default = 50 + +**logFileLineFormatString** + +A Python format string that controls the format of individual lines in +the logs:: + + logFileLineFormatString = cm.Option() + logFileLineFormatString.doc = 'python logging system format for log file entries' + logFileLineFormatString.default = '%(asctime)s %(levelname)s - %(message)s' + +**logFileErrorLoggingLevel** + +Logging is done in severity levels - the lower the number, the more +verbose the logs.:: + + logFileErrorLoggingLevel = cm.Option() + logFileErrorLoggingLevel.doc = 'logging level for the log file (10 - DEBUG, 20 - INFO, 30 - WARNING, 40 - ERROR, 50 - CRITICAL)' + logFileErrorLoggingLevel.default = 10 + +**stderrLineFormatString** + +In parallel with creating log files, Monitor can log to stderr. This +is a Python format string that controls the format of individual lines +sent to stderr.:: + + stderrLineFormatString = cm.Option() + stderrLineFormatString.doc = 'python logging system format for logging to stderr' + stderrLineFormatString.default = '%(asctime)s %(levelname)s - %(message)s' + +**stderrErrorLoggingLevel** + +Logging to stderr is done in severity levels independently from the +log file severity levels - the lower the number, the more verbose the +output to stderr.:: + + stderrErrorLoggingLevel = cm.Option() + stderrErrorLoggingLevel.doc = 'logging level for the logging to stderr (10 - DEBUG, 20 - INFO, 30 - WARNING, 40 - ERROR, 50 - CRITICAL)' + stderrErrorLoggingLevel.default = 40 diff --git a/docs/development/glossary/standardjobstorage.rst b/docs/development/glossary/standardjobstorage.rst new file mode 100644 index 0000000000..03a2a04b1e --- /dev/null +++ b/docs/development/glossary/standardjobstorage.rst @@ -0,0 +1,21 @@ +.. index:: standardjobstorage + +.. _standardjobstorage-chapter: + + +Standard Job Storage +==================== + +Standard storage is where the JSON/dump pairs are saved while they +wait for processing. The location of the standard storage is +determined by the configuration parameter storageRoot found in the +:ref:`commonconfig-chapter`. + +The file system is divided into two parts: date based storage and name +based storage. Both branches use a radix sort breakdown to locate +files. The original version of Socorro used only the date based +storage, but it was found to be too slow to search when under a heavy +load. + +For a deeper discussion of the storage technique: see +:ref:`filesystem-chapter` diff --git a/docs/development/processor.rst b/docs/development/processor.rst index 337aaba10d..e51227a7ad 100644 --- a/docs/development/processor.rst +++ b/docs/development/processor.rst @@ -14,3 +14,6 @@ output, and records the results in the hbase. The processor, coupled with stackwalk_server, is computationally intensive. Multiple instances of the processor can be run simultaneously from different machines. 
+
+`See sample config code on Github
+`_
diff --git a/docs/development/reviewprocess.rst b/docs/development/reviewprocess.rst
index 27cfa0aa59..78e94dc09c 100644
--- a/docs/development/reviewprocess.rst
+++ b/docs/development/reviewprocess.rst
@@ -66,7 +66,8 @@ To run the unit tests in a Vagrant VM, do the following::
     make test
 
 This installs all the dependencies needed and run all the tests. You need to
-have a running PostgreSQL instance for this to work.
+have a running PostgreSQL instance for this to work, with a specific config
+file for the tests in ``socorro/unittest/config/commonconfig.py``.
 
 For further documentation on unit tests, please read :ref:`unittesting-chapter`.
diff --git a/docs/development/unittesting.rst b/docs/development/unittesting.rst
index 24e67492d1..7224ac8797 100644
--- a/docs/development/unittesting.rst
+++ b/docs/development/unittesting.rst
@@ -411,3 +411,34 @@ readability::
         """A brief description about this test."""
         assert True
+
+...............
+
+Old instructions (what is still important about them?)
+
+* We must either provide a PostgreSQL account with a name and
+  password that match the config file, or edit the test config file
+  to provide an appropriate test account and password. That file is
+  ``socorro/unittest/config/commonconfig.py``. If you add a new test
+  config file that needs database access, you should import the
+  details from commonconfig, as exemplified in the existing config
+  files.
+* We must provide a database appropriate for the test user
+  (default: ``test``). That database must support PLPGSQL. As the owner of
+  the test database, while connected to that database, invoke ``CREATE
+  LANGUAGE PLPGSQL;``
+
+* What is red?
+
+  Short for ``redo`` or ``do it again``. There is a bash shell file
+  called ``socorro/unittest/red`` which may be sourced to provide a bash
+  function called ``red`` that simplifies watching test logfiles in a
+  separate terminal window. In that window, cd to the unittest
+  sub-directory of interest, then source the file: ``. ../red``, then call
+  ``red``. The effect is to clear the screen, then ``tail -F`` the logfile
+  associated with tests in that directory. You may chant ``red --help`` to
+  be reminded.
+
+  The red file also provides a function ``noseErrors`` which simplifies
+  the examination of nosetests output. Chant ``noseErrors --help`` for a
+  brief summary.
diff --git a/scripts/config/collectorconfig.py.dist b/scripts/config/collectorconfig.py.dist
new file mode 100644
index 0000000000..04d74ef70b
--- /dev/null
+++ b/scripts/config/collectorconfig.py.dist
@@ -0,0 +1,193 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+# This is the common configuration file for the Socorro mod-wsgi Collector.
+#
+# Parameters consist of three or four lines of Python code. The first line
+# defines the parameter:
+
+# someParameter = cm.Option()
+
+# The second line provides one line of documentation about the parameter.
+# This line is also used as the text for the '--help' option invoked from the
+# command line.
+
+# someParameter.doc = 'this is what this parameter is about'
+
+# The third line is the default value for the parameter. Adjust this line as
+# necessary. This value will be overridden if the user specifies a different
+# value on the command line.
+ +# someParameter.default = '00:00:30' + +# The option fourth line specifies a Python function (or callable) that will +# serve to take a text version of the parameter value and turn it into the +# proper type. This is useful for things like taking a textual datetime value +# and turning them into a real datetime value. The ConfigurationManager module +# offers a number of converters: dateTimeConverter, timeDeltaConverter, +# booleanConverter. If the value of 'default' is a fundamental type like int +# or float, it is not necessary to provide a fromStringConverter. + +# someParameter.fromStringConverter = ConfigurationManager.dateTimeConverter +#------------------------------------------------------------------------------- + +import stat +import re + +import socorro.lib.ConfigurationManager as cm + +#------------------------------------------------------------------------------- +# Storage constants + +from config.commonconfig import jsonFileSuffix +from config.commonconfig import dumpFileSuffix + +#------------------------------------------------------------------------------- +# Storage constants + +primaryStorageClass = cm.Option() +primaryStorageClass.doc = 'the name of the class for primary storage ' \ + '(socorro.storage.crashstorage.CrashStorageSystemForLocalFS or' \ + ' socorro.storage.crashstorage.CollectorCrashStorageSystemForHBase)' +primaryStorageClass.default = 'socorro.storage.crashstorage.CrashStorageSystemForLocalFS' +primaryStorageClass.fromStringConverter = cm.classConverter + +localFS = cm.Option() +localFS.doc = 'a path to a local file system' +localFS.default = '/home/socorro/primaryCrashStore' + +localFSDumpDirCount = cm.Option() +localFSDumpDirCount.doc = 'the number of dumps to be stored in a single directory in the local file system' +localFSDumpDirCount.default = 1024 + +localFSDumpGID = cm.Option() +localFSDumpGID.doc="the group ID for saved crashes in local file system (optional)" +localFSDumpGID.default = None + +localFSDumpPermissions = cm.Option() +localFSDumpPermissions.doc = "a number used for permissions crash dump files in the local file system" +localFSDumpPermissions.default = stat.S_IRGRP | stat.S_IWGRP | stat.S_IRUSR | stat.S_IWUSR + +localFSDirPermissions = cm.Option() +localFSDirPermissions.doc = "a number used for permissions for directories in the local file system" +localFSDirPermissions.default = stat.S_IRGRP | stat.S_IXGRP | stat.S_IWGRP | stat.S_IRUSR | stat.S_IXUSR | stat.S_IWUSR + +fallbackFS = cm.Option() +fallbackFS.doc = 'a path to a local file system to use if local store fails' +fallbackFS.default = '/home/socorro/fallback' + +fallbackDumpDirCount = cm.Option() +fallbackDumpDirCount.doc = 'the number of dumps to be stored in a single directory in the fallback File System' +fallbackDumpDirCount.default = 1024 + +fallbackDumpGID = cm.Option() +fallbackDumpGID.doc="the group ID for saved crashes in fallback File System (optional)" +fallbackDumpGID.default = None + +fallbackDumpPermissions = cm.Option() +fallbackDumpPermissions.doc = "a number used for permissions crash dump files in the fallback File System" +fallbackDumpPermissions.default = stat.S_IRGRP | stat.S_IWGRP | stat.S_IRUSR | stat.S_IWUSR + +fallbackDirPermissions = cm.Option() +fallbackDirPermissions.doc = "a number used for permissions for directories in the fallback File System" +fallbackDirPermissions.default = stat.S_IRGRP | stat.S_IXGRP | stat.S_IWGRP | stat.S_IRUSR | stat.S_IXUSR | stat.S_IWUSR + +from config.commonconfig import hbaseHost +from config.commonconfig 
import hbasePort +from config.commonconfig import hbaseTimeout +hbaseFallbackFS = fallbackFS +hbaseFallbackDumpDirCount = fallbackDumpDirCount +hbaseFallbackDumpGID = fallbackDumpGID +hbaseFallbackDumpPermissions = fallbackDumpPermissions +hbaseFallbackDirPermissions = fallbackDirPermissions + +#------------------------------------------------------------------------------- +# application server parameters + +modwsgiInstallation = cm.Option() +modwsgiInstallation.doc = 'True or False, this app is installed under mod_wsgi' +modwsgiInstallation.default = True + +serverIPAddress = cm.Option() +serverIPAddress.doc = 'the IP address from which to accept submissions if not installed under mod_wsgi' +serverIPAddress.default = '127.0.0.1' + +serverPort = cm.Option() +serverPort.doc = 'the port to listen to for submissions if not installed under mod_wsgi' +serverPort.default = 8882 + +# The form field the client sends the dump in +dumpField = cm.Option() +dumpField.default = "upload_file_minidump" + +# when storing in the file system, how deep should the radix directory depth be +storageDepth = cm.Option() +storageDepth.default = 2 + +# Returned to the client with a uuid following +dumpIDPrefix = cm.Option() +dumpIDPrefix.default = "bp-" + +# Bugzilla 495700 - need to be able to ignore new Thottleable protocol +neverDiscard = cm.Option() +neverDiscard.default = True +neverDiscard.fromStringConverter = cm.booleanConverter + +throttleConditions = cm.Option() +throttleConditions.default = throttleConditions.default = [ + ("*", lambda d: "HangID" in d and d.get("ProcessType", "browser") == "browser", None), # drop browser hangs + ("Comments", lambda x: x, 100), # 100% of crashes with comments + ("Email", lambda x: x, 100), # 100% of crashes with email address + ("ReleaseChannel", lambda x: x in ("aurora", "beta", "esr"), 100), + ("ReleaseChannel", lambda x: x.startswith('nightly'), 100), + ("ProductName", 'Firefox', 10), # 10% of Firefox + ("ProductName", 'Fennec', 100), # 100% of Fennec + ("Version", re.compile(r'\..*?[a-zA-Z]+'), 100), # 100% of all alpha, beta or special + ("ProductName", lambda x: x[0] in 'TSC', 100), # 100% of Thunderbird & SeaMonkey + (None, True, 0) # reject everything else +] + + +minimalVersionForUnderstandingRefusal = cm.Option() +minimalVersionForUnderstandingRefusal.default = { 'Firefox': '3.5.4' } + +benchmark = cm.Option() +benchmark.default = False +benchmark.fromStringConverter = cm.booleanConverter + + +#------------------------------------------------------------------------------- +# storage + + +#------------------------------------------------------------------------------- +# Logging + +syslogHost = cm.Option() +syslogHost.doc = 'syslog hostname' +syslogHost.default = 'localhost' + +syslogPort = cm.Option() +syslogPort.doc = 'syslog port' +syslogPort.default = 514 + +syslogSocket = cm.Option() +syslogSocket.doc = 'syslog local socket' +syslogSocket.default = '/dev/log' + +syslogTransport = cm.Option() +syslogTransport.doc = 'syslog transport method ("socket", "udp")' +syslogTransport.default = 'socket' + +syslogFacilityString = cm.Option() +syslogFacilityString.doc = 'syslog facility string ("user", "local0", etc)' +syslogFacilityString.default = 'user' + +syslogLineFormatString = cm.Option() +syslogLineFormatString.doc = 'python logging system format for syslog entries' +syslogLineFormatString.default = 'Socorro Collector (pid %(process)d): %(asctime)s %(levelname)s - %(threadName)s - %(message)s' + +syslogErrorLoggingLevel = cm.Option() 
+syslogErrorLoggingLevel.doc = 'logging level for the log file (10 - DEBUG, 20 - INFO, 30 - WARNING, 40 - ERROR, 50 - CRITICAL)' +syslogErrorLoggingLevel.default = 10 diff --git a/scripts/config/commonconfig.py.dist b/scripts/config/commonconfig.py.dist new file mode 100644 index 0000000000..49fa9a9b2d --- /dev/null +++ b/scripts/config/commonconfig.py.dist @@ -0,0 +1,203 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +import socorro.lib.ConfigurationManager as cm +import datetime +import stat + +#--------------------------------------------------------------------------- +# Relational Database Section + +databaseHost = cm.Option() +databaseHost.doc = 'the hostname of the database servers' +databaseHost.default = 'localhost' + +databasePort = cm.Option() +databasePort.doc = 'the port of the database on the host' +databasePort.default = 5432 + +databaseName = cm.Option() +databaseName.doc = 'the name of the database within the server' +databaseName.default = 'breakpad' + +databaseUserName = cm.Option() +databaseUserName.doc = 'the user name for the database servers' +databaseUserName.default = 'breakpad_rw' + +databasePassword = cm.Option() +databasePassword.doc = 'the password for the database user' +databasePassword.default = 'aPassword' + +databaseTempbuffers = cm.Option() +databaseTempbuffers.doc = 'temp_buffers setting for use by matviews' +databaseTempbuffers.default = '512MB' + +# This is a set of transition parameters to support deprecation +database_hostname = cm.Option() +database_hostname.doc = 'the hostname of the database servers' +database_hostname.default = 'localhost' + +database_port = cm.Option() +database_port.doc = 'the port of the database on the host' +database_port.default = 5432 + +database_name = cm.Option() +database_name.doc = 'the name of the database within the server' +database_name.default = 'breakpad' + +database_username = cm.Option() +database_username.doc = 'the user name for the database servers' +database_username.default = 'breakpad_rw' + +database_password = cm.Option() +database_password.doc = 'the password for the database user' +database_password.default = 'aPassword' + +#--------------------------------------------------------------------------- +# RabbitMQ config + +rabbitMQUsername = cm.Option() +rabbitMQUsername.doc = 'the username of the rabbitmq user' +rabbitMQUsername.default = 'guest' + +rabbitMQPassword = cm.Option() +rabbitMQPassword.doc = 'the password of the rabbitmq user' +rabbitMQPassword.default = 'guest' + +rabbitMQHost = cm.Option() +rabbitMQHost.doc = 'the hostname of the rabbitmq service' +rabbitMQHost.default = 'localhost' + +rabbitMQPort = cm.Option() +rabbitMQPort.doc = 'the port of the rabbitmq service' +rabbitMQPort.default = 5672 + +rabbitMQVirtualhost = cm.Option() +rabbitMQVirtualhost.doc = 'the virtual host for rabbitmq' +rabbitMQVirtualhost.default = '/' + + +rabbitMQStandardQueue = cm.Option() +rabbitMQStandardQueue.doc = 'the standard queue for rabbitmq' +rabbitMQStandardQueue.default = 'socorro.normal' + +rabbitMQPriorityQueue = cm.Option() +rabbitMQPriorityQueue.doc = 'the standard queue for rabbitmq' +rabbitMQPriorityQueue.default = 'socorro.priority' + +#--------------------------------------------------------------------------- +# statsd config + +statsdHost = cm.Option() +statsdHost.doc = '' +statsdHost.default = 'localhost' + +statsdPort = cm.Option() 
+statsdPort.doc = '8125' +statsdPort.default = 8125 + +statsdPrefix = cm.Option() +statsdPrefix.doc = '' +statsdPrefix.default = None + +#--------------------------------------------------------------------------- +# Crash storage system + +jsonFileSuffix = cm.Option() +jsonFileSuffix.doc = 'the suffix used to identify a json file' +jsonFileSuffix.default = '.json' + +dumpFileSuffix = cm.Option() +dumpFileSuffix.doc = 'the suffix used to identify a dump file' +dumpFileSuffix.default = '.dump' + +#--------------------------------------------------------------------------- +# HBase storage system + +hbaseStorageClass = cm.Option() +hbaseStorageClass.doc = 'the Socorro classname for HBase storage' +hbaseStorageClass.default = 'socorro.storage.crashstorage.CrashStorageSystemForHBase' +#hbaseStorageClass.default = 'socorro.storage.crashstorage.DualHbaseCrashStorageSystem' +hbaseStorageClass.fromStringConverter = cm.classConverter + +hbaseHost = cm.Option() +hbaseHost.doc = 'Hostname for hbase hadoop cluster. May be a VIP or load balancer' +hbaseHost.default = 'crash-stats' + +hbasePort = cm.Option() +hbasePort.doc = 'hbase port number' +hbasePort.default = 9090 + +hbaseTimeout = cm.Option() +hbaseTimeout.doc = 'timeout in milliseconds for an HBase connection' +hbaseTimeout.default = 5000 + +secondaryHbaseHost = cm.Option() +secondaryHbaseHost.doc = 'Hostname for hbase hadoop cluster. May be a VIP or load balancer' +secondaryHbaseHost.default = 'localhost' + +secondaryHbasePort = cm.Option() +secondaryHbasePort.doc = 'hbase port number' +secondaryHbasePort.default = 9090 + +secondaryHbaseTimeout = cm.Option() +secondaryHbaseTimeout.doc = 'timeout in milliseconds for an HBase connection' +secondaryHbaseTimeout.default = 5000 + +#--------------------------------------------------------------------------- +# misc + +processorCheckInTime = cm.Option() +processorCheckInTime.doc = 'the time after which a processor is considered dead (hh:mm:ss)' +processorCheckInTime.default = "00:05:00" +processorCheckInTime.fromStringConverter = lambda x: str(cm.timeDeltaConverter(x)) + +startWindow = cm.Option() +startWindow.doc = 'The start of the single aggregation window (YYYY-MM-DD [hh:mm:ss])' +startWindow.fromStringConverter = cm.dateTimeConverter + +deltaWindow = cm.Option() +deltaWindow.doc = 'The length of the single aggregation window ([dd:]hh:mm:ss)' +deltaWindow.fromStringConverter = cm.timeDeltaConverter + +defaultDeltaWindow = cm.Option() +defaultDeltaWindow.doc = 'The length of the single aggregation window ([dd:]hh:mm:ss)' +defaultDeltaWindow.fromStringConverter = cm.timeDeltaConverter + +# override this default for your particular cron task +defaultDeltaWindow.default = '00:12:00' + +endWindow = cm.Option() +endWindow.doc = 'The end of the single aggregation window (YYYY-MM-DD [hh:mm:ss])' +endWindow.fromStringConverter = cm.dateTimeConverter + +startDate = cm.Option() +startDate.doc = 'The start of the overall/outer aggregation window (YYYY-MM-DD [hh:mm])' +startDate.fromStringConverter = cm.dateTimeConverter + +deltaDate = cm.Option() +deltaDate.doc = 'The length of the overall/outer aggregation window ([dd:]hh:mm:ss)' +deltaDate.fromStringConverter = cm.timeDeltaConverter + +initialDeltaDate = cm.Option() +initialDeltaDate.doc = 'The length of the overall/outer aggregation window ([dd:]hh:mm:ss)' +initialDeltaDate.fromStringConverter = cm.timeDeltaConverter + +# override this default for your particular cron task +initialDeltaDate.default = '4:00:00:00' + +minutesPerSlot = cm.Option() 
+minutesPerSlot.doc = 'how many minutes per leaf directory in the date storage branch' +minutesPerSlot.default = 1 + +endDate = cm.Option() +endDate.doc = 'The end of the overall/outer aggregation window (YYYY-MM-DD [hh:mm:ss])' +endDate.fromStringConverter = cm.dateTimeConverter + +debug = cm.Option() +debug.doc = 'do debug output and routines' +debug.default = False +debug.singleCharacter = 'D' +debug.fromStringConverter = cm.booleanConverter + diff --git a/scripts/config/crashmoverconfig.py.dist b/scripts/config/crashmoverconfig.py.dist new file mode 100644 index 0000000000..a7e7c49f40 --- /dev/null +++ b/scripts/config/crashmoverconfig.py.dist @@ -0,0 +1,79 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +import stat +import socorro.lib.ConfigurationManager as cm + +#------------------------------------------------------------------------------- +# general + +numberOfThreads = cm.Option() +numberOfThreads.doc = 'the number of threads to use' +numberOfThreads.default = 4 + +#------------------------------------------------------------------------------- +# source storage + +sourceStorageClass = cm.Option() +sourceStorageClass.doc = 'the fully qualified name of the source storage class' +sourceStorageClass.default = 'socorro.storage.crashstorage.CrashStorageSystemForLocalFS' +sourceStorageClass.fromStringConverter = cm.classConverter + +from config.collectorconfig import localFS +from config.collectorconfig import localFSDumpDirCount +from config.collectorconfig import localFSDumpGID +from config.collectorconfig import localFSDumpPermissions +from config.collectorconfig import localFSDirPermissions +from config.collectorconfig import fallbackFS +from config.collectorconfig import fallbackDumpDirCount +from config.collectorconfig import fallbackDumpGID +from config.collectorconfig import fallbackDumpPermissions +from config.collectorconfig import fallbackDirPermissions + +from config.commonconfig import jsonFileSuffix +from config.commonconfig import dumpFileSuffix + +#------------------------------------------------------------------------------- +# destination storage + +destinationStorageClass = cm.Option() +destinationStorageClass.doc = 'the fully qualified name of the source storage class' +destinationStorageClass.default = 'socorro.storage.crashstorage.CrashStorageSystemForHBase' +destinationStorageClass.fromStringConverter = cm.classConverter + +from config.commonconfig import hbaseHost +from config.commonconfig import hbasePort +from config.commonconfig import hbaseTimeout + +#------------------------------------------------------------------------------- +# logging + +syslogHost = cm.Option() +syslogHost.doc = 'syslog hostname' +syslogHost.default = 'localhost' + +syslogPort = cm.Option() +syslogPort.doc = 'syslog port' +syslogPort.default = 514 + +syslogFacilityString = cm.Option() +syslogFacilityString.doc = 'syslog facility string ("user", "local0", etc)' +syslogFacilityString.default = 'user' + +syslogLineFormatString = cm.Option() +syslogLineFormatString.doc = 'python logging system format for syslog entries' +syslogLineFormatString.default = 'Socorro Storage Mover (pid %(process)d): %(asctime)s %(levelname)s - %(threadName)s - %(message)s' + +syslogErrorLoggingLevel = cm.Option() +syslogErrorLoggingLevel.doc = 'logging level for the log file (10 - DEBUG, 20 - INFO, 30 - WARNING, 40 - ERROR, 50 - CRITICAL)' 
+syslogErrorLoggingLevel.default = 10 + +stderrLineFormatString = cm.Option() +stderrLineFormatString.doc = 'python logging system format for logging to stderr' +stderrLineFormatString.default = '%(asctime)s %(levelname)s - %(threadName)s - %(message)s' + +stderrErrorLoggingLevel = cm.Option() +stderrErrorLoggingLevel.doc = 'logging level for the logging to stderr (10 - DEBUG, 20 - INFO, 30 - WARNING, 40 - ERROR, 50 - CRITICAL)' +stderrErrorLoggingLevel.default = 10 + diff --git a/scripts/config/dailyurlconfig.py.dist b/scripts/config/dailyurlconfig.py.dist new file mode 100644 index 0000000000..389a2dcca4 --- /dev/null +++ b/scripts/config/dailyurlconfig.py.dist @@ -0,0 +1,73 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +import socorro.lib.ConfigurationManager as cm +import datetime +import re + +try: + from config.commonconfig import database_hostname + from config.commonconfig import database_name + from config.commonconfig import database_username + from config.commonconfig import database_password +except ImportError: + from commonconfig import database_hostname + from commonconfig import database_name + from commonconfig import database_username + from commonconfig import database_password + +from socorro.lib.datetimeutil import utc_now + +day = cm.Option() +day.doc = 'the date to dump (YYYY-MM-DD)' +day.default = utc_now().date() +day.fromStringConverter = cm.dateTimeConverter + +outputPath = cm.Option() +outputPath.doc = "file system location to put the 'internal/private' output csv file" +outputPath.default = '.' + +publicOutputPath = cm.Option() +publicOutputPath.doc = "file system location to put the 'external/public' output csv file" +publicOutputPath.default = '.' 
+ +product = cm.Option() +product.doc = 'a comma delimited list of the products to track (leave blank for all)' +product.default = 'Firefox' + +version = cm.Option() +version.doc = 'a comma delimited list of the versions to track (leave blank for all)' +version.default = '' + +#------------------------------------------------------------------------------- +# Logging + +syslogHost = cm.Option() +syslogHost.doc = 'syslog hostname' +syslogHost.default = 'localhost' + +syslogPort = cm.Option() +syslogPort.doc = 'syslog port' +syslogPort.default = 514 + +syslogFacilityString = cm.Option() +syslogFacilityString.doc = 'syslog facility string ("user", "local0", etc)' +syslogFacilityString.default = 'user' + +syslogLineFormatString = cm.Option() +syslogLineFormatString.doc = 'python logging system format for syslog entries' +syslogLineFormatString.default = 'Socorro Daily URL (pid %(process)d): %(asctime)s %(levelname)s - %(threadName)s - %(message)s' + +syslogErrorLoggingLevel = cm.Option() +syslogErrorLoggingLevel.doc = 'logging level for the log file (10 - DEBUG, 20 - INFO, 30 - WARNING, 40 - ERROR, 50 - CRITICAL)' +syslogErrorLoggingLevel.default = 10 + +stderrLineFormatString = cm.Option() +stderrLineFormatString.doc = 'python logging system format for logging to stderr' +stderrLineFormatString.default = '%(asctime)s %(levelname)s - %(message)s' + +stderrErrorLoggingLevel = cm.Option() +stderrErrorLoggingLevel.doc = 'logging level for the logging to stderr (10 - DEBUG, 20 - INFO, 30 - WARNING, 40 - ERROR, 50 - CRITICAL)' +stderrErrorLoggingLevel.default = 10 + diff --git a/scripts/config/fixbrokendumpsconfig.py.dist b/scripts/config/fixbrokendumpsconfig.py.dist new file mode 100644 index 0000000000..3b4f241146 --- /dev/null +++ b/scripts/config/fixbrokendumpsconfig.py.dist @@ -0,0 +1,84 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +import socorro.lib.ConfigurationManager as cm +import datetime + +from config.commonconfig import databaseHost +from config.commonconfig import databaseName +from config.commonconfig import databaseUserName +from config.commonconfig import databasePassword + +from config.commonconfig import hbaseHost +from config.commonconfig import hbasePort +from config.commonconfig import hbaseTimeout + +brokenFirefoxLinuxQuery = """ + SELECT uuid,date_processed FROM reports WHERE product = 'Firefox' + AND (version = '4.0b11' OR version = '4.0b12') + AND os_name = 'Linux' + AND date_processed > '%s' + AND date_processed < (now() - INTERVAL '30 minutes') + ORDER BY date_processed +""" +brokenFirefoxLinuxFixer = './minidump_hack-firefox_linux' + +brokenFennecQuery = """ + SELECT uuid,date_processed FROM reports WHERE product = 'Fennec' + AND version = '4.0b5' + AND date_processed > '%s' + AND date_processed < (now() - INTERVAL '30 minutes') + ORDER BY date_processed +""" +brokenFennecFixer = './minidump_hack-fennec' + +brokenBoot2GeckoQuery = """ + SELECT uuid,date_processed FROM reports WHERE product = 'B2G' + AND os_name = 'Android' + AND date_processed > '%s' + AND date_processed < (now() - INTERVAL '30 minutes') + ORDER BY date_processed +""" +brokenBoot2GeckoFixer = './minidump_hack-b2g' + +persistentBrokenDumpPathname = cm.Option() +persistentBrokenDumpPathname.doc = 'a pathname to a file system location where this script can store persistent data' +persistentBrokenDumpPathname.default = './fixbrokendumps.pickle' + +daysIntoPast = cm.Option() +daysIntoPast.doc = 'number of days to look into the past for broken crashes (0 - use last run time)' +daysIntoPast.default = 30 + +#------------------------------------------------------------------------------- +# Logging + +syslogHost = cm.Option() +syslogHost.doc = 'syslog hostname' +syslogHost.default = 'localhost' + +syslogPort = cm.Option() +syslogPort.doc = 'syslog port' +syslogPort.default = 514 + +syslogFacilityString = cm.Option() +syslogFacilityString.doc = 'syslog facility string ("user", "local0", etc)' +syslogFacilityString.default = 'user' + +syslogLineFormatString = cm.Option() +syslogLineFormatString.doc = 'python logging system format for syslog entries' +syslogLineFormatString.default = 'Socorro Fix Broken Dumps (pid %(process)d): %(asctime)s %(levelname)s - %(threadName)s - %(message)s' + +syslogErrorLoggingLevel = cm.Option() +syslogErrorLoggingLevel.doc = 'logging level for the log file (10 - DEBUG, 20 - INFO, 30 - WARNING, 40 - ERROR, 50 - CRITICAL)' +syslogErrorLoggingLevel.default = 10 + +stderrLineFormatString = cm.Option() +stderrLineFormatString.doc = 'python logging system format for logging to stderr' +stderrLineFormatString.default = '%(asctime)s %(levelname)s - %(message)s' + +stderrErrorLoggingLevel = cm.Option() +stderrErrorLoggingLevel.doc = 'logging level for the logging to stderr (10 - DEBUG, 20 - INFO, 30 - WARNING, 40 - ERROR, 50 - CRITICAL)' +stderrErrorLoggingLevel.default = 40 + + diff --git a/scripts/config/monitorconfig.py.dist b/scripts/config/monitorconfig.py.dist new file mode 100644 index 0000000000..f3831fbc2d --- /dev/null +++ b/scripts/config/monitorconfig.py.dist @@ -0,0 +1,74 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +import socorro.lib.ConfigurationManager as cm +import datetime +import stat + +#--------------------------------------------------------------------------- +# imported config + +from config.commonconfig import databaseHost +from config.commonconfig import databaseName +from config.commonconfig import databasePort +from config.commonconfig import databaseUserName +from config.commonconfig import databasePassword +from config.commonconfig import processorCheckInTime + +#--------------------------------------------------------------------------- +# HBase storage system +from config.commonconfig import hbaseHost +from config.commonconfig import hbasePort +from config.commonconfig import hbaseTimeout + +#--------------------------------------------------------------------------- +# monitor local config + +standardLoopDelay = cm.Option() +standardLoopDelay.doc = 'the time between scans for jobs (HHH:MM:SS)' +standardLoopDelay.default = '00:05:00' +standardLoopDelay.fromStringConverter = cm.timeDeltaConverter + +cleanupJobsLoopDelay = cm.Option() +cleanupJobsLoopDelay.doc = 'the time between runs of the job clean up routines (HHH:MM:SS)' +cleanupJobsLoopDelay.default = '00:05:00' +cleanupJobsLoopDelay.fromStringConverter = cm.timeDeltaConverter + +priorityLoopDelay = cm.Option() +priorityLoopDelay.doc = 'the time between checks for priority jobs (HHH:MM:SS)' +priorityLoopDelay.default = '00:01:00' +priorityLoopDelay.fromStringConverter = cm.timeDeltaConverter + +#------------------------------------------------------------------------------- +# Logging + +syslogHost = cm.Option() +syslogHost.doc = 'syslog hostname' +syslogHost.default = 'localhost' + +syslogPort = cm.Option() +syslogPort.doc = 'syslog port' +syslogPort.default = 514 + +syslogFacilityString = cm.Option() +syslogFacilityString.doc = 'syslog facility string ("user", "local0", etc)' +syslogFacilityString.default = 'user' + +syslogLineFormatString = cm.Option() +syslogLineFormatString.doc = 'python logging system format for syslog entries' +syslogLineFormatString.default = 'Socorro Monitor (pid %(process)d): %(asctime)s %(levelname)s - %(threadName)s - %(message)s' + +syslogErrorLoggingLevel = cm.Option() +syslogErrorLoggingLevel.doc = 'logging level for the log file (10 - DEBUG, 20 - INFO, 30 - WARNING, 40 - ERROR, 50 - CRITICAL)' +syslogErrorLoggingLevel.default = 10 + +stderrLineFormatString = cm.Option() +stderrLineFormatString.doc = 'python logging system format for logging to stderr' +stderrLineFormatString.default = '%(asctime)s %(levelname)s - %(threadName)s - %(message)s' + +stderrErrorLoggingLevel = cm.Option() +stderrErrorLoggingLevel.doc = 'logging level for the logging to stderr (10 - DEBUG, 20 - INFO, 30 - WARNING, 40 - ERROR, 50 - CRITICAL)' +stderrErrorLoggingLevel.default = 10 + + diff --git a/scripts/config/orphansubmitterconf.py.dist b/scripts/config/orphansubmitterconf.py.dist new file mode 100644 index 0000000000..4fec11d383 --- /dev/null +++ b/scripts/config/orphansubmitterconf.py.dist @@ -0,0 +1,87 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +import stat +import socorro.lib.ConfigurationManager as cm + +#------------------------------------------------------------------------------- +# source storage + +sourceStorageClass = cm.Option() +sourceStorageClass.doc = 'the fully qualified name of the source storage class' +sourceStorageClass.default = 'socorro.storage.crashstorage.CrashStorageSystemForLocalFS' +sourceStorageClass.fromStringConverter = cm.classConverter + +from config.collectorconfig import localFS +from config.collectorconfig import localFSDumpDirCount +from config.collectorconfig import localFSDumpGID +from config.collectorconfig import localFSDumpPermissions +from config.collectorconfig import localFSDirPermissions +from config.collectorconfig import fallbackFS +from config.collectorconfig import fallbackDumpDirCount +from config.collectorconfig import fallbackDumpGID +from config.collectorconfig import fallbackDumpPermissions +from config.collectorconfig import fallbackDirPermissions + +from config.commonconfig import jsonFileSuffix +from config.commonconfig import dumpFileSuffix + +#------------------------------------------------------------------------------- +# destination storage + +destinationStorageClass = cm.Option() +destinationStorageClass.doc = 'the fully qualified name of the source storage class' +destinationStorageClass.default = 'socorro.storage.crashstorage.CrashStorageSystemForHBase' +destinationStorageClass.fromStringConverter = cm.classConverter + +from config.commonconfig import hbaseHost +from config.commonconfig import hbasePort +from config.commonconfig import hbaseTimeout + +#------------------------------------------------------------------------------- +# general + +searchRoot = cm.Option() +searchRoot.doc = 'the file system root at which to start the orphan search' +searchRoot.default = localFS.default + +numberOfThreads = cm.Option() +numberOfThreads.doc = 'the number of threads to use' +numberOfThreads.default = 4 + +dryrun = cm.Option() +dryrun.doc = "if True, don't actually move things into destination" +dryrun.default = False + +#------------------------------------------------------------------------------- +# logging + +syslogHost = cm.Option() +syslogHost.doc = 'syslog hostname' +syslogHost.default = 'localhost' + +syslogPort = cm.Option() +syslogPort.doc = 'syslog port' +syslogPort.default = 514 + +syslogFacilityString = cm.Option() +syslogFacilityString.doc = 'syslog facility string ("user", "local0", etc)' +syslogFacilityString.default = 'user' + +syslogLineFormatString = cm.Option() +syslogLineFormatString.doc = 'python logging system format for syslog entries' +syslogLineFormatString.default = 'Socorro Storage Mover (pid %(process)d): %(asctime)s %(levelname)s - %(threadName)s - %(message)s' + +syslogErrorLoggingLevel = cm.Option() +syslogErrorLoggingLevel.doc = 'logging level for the log file (10 - DEBUG, 20 - INFO, 30 - WARNING, 40 - ERROR, 50 - CRITICAL)' +syslogErrorLoggingLevel.default = 10 + +stderrLineFormatString = cm.Option() +stderrLineFormatString.doc = 'python logging system format for logging to stderr' +stderrLineFormatString.default = '%(asctime)s %(levelname)s - %(threadName)s - %(message)s' + +stderrErrorLoggingLevel = cm.Option() +stderrErrorLoggingLevel.doc = 'logging level for the logging to stderr (10 - DEBUG, 20 - INFO, 30 - WARNING, 40 - ERROR, 50 - CRITICAL)' +stderrErrorLoggingLevel.default = 10 + diff --git a/scripts/config/processorconfig.py.dist b/scripts/config/processorconfig.py.dist new file mode 100644 index 0000000000..7c486120ec --- 
/dev/null +++ b/scripts/config/processorconfig.py.dist @@ -0,0 +1,418 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +import socorro.lib.ConfigurationManager as cm +import datetime +import re +import stat + +processor_implementation = cm.Option() +processor_implementation.doc = 'select the processor vintage (2008, 2012)' +processor_implementation.default = 2008 + +#--------------------------------------------------------------------------- +# imported config + +from config.commonconfig import databaseHost +from config.commonconfig import databasePort +from config.commonconfig import databaseName +from config.commonconfig import databaseUserName +from config.commonconfig import databasePassword +from config.commonconfig import processorCheckInTime +from config.commonconfig import jsonFileSuffix +from config.commonconfig import dumpFileSuffix + +#--------------------------------------------------------------------------- +# HBase storage system +from config.commonconfig import hbaseStorageClass + +from config.commonconfig import hbaseHost +from config.commonconfig import hbasePort +from config.commonconfig import hbaseTimeout + +from config.commonconfig import secondaryHbaseHost +from config.commonconfig import secondaryHbasePort +from config.commonconfig import secondaryHbaseTimeout + +from config.commonconfig import statsdHost +from config.commonconfig import statsdPort +from config.commonconfig import statsdPrefix + +temporaryFileSystemStoragePath = cm.Option() +temporaryFileSystemStoragePath.doc = 'a local filesystem path where processor can write dumps temporarily for processing' +temporaryFileSystemStoragePath.default = '/home/socorro/temp' + +#--------------------------------------------------------------------------- +# local processor config + +from collectorconfig import dumpField + +elasticsearch_urls = cm.Option() +elasticsearch_urls.doc = 'the urls for the elasticsearch instances (leave blank to disable)' +elasticsearch_urls.default = 'http://localhost:9200/' + +elasticsearch_index = cm.Option() +elasticsearch_index.doc = "an index to insert crashes in elasticsearch (use datetime's strftime format to have daily, weekly or monthly indexes)" +elasticsearch_index.default = 'socorro%Y%W' + +elasticsearch_doctype = cm.Option() +elasticsearch_doctype.doc = 'a type to insert crashes in elasticsearch' +elasticsearch_doctype.default = 'crash_reports' + +elasticsearch_index_settings = cm.Option() +elasticsearch_index_settings.doc = 'the mapping of crash reports to insert' +elasticsearch_index_settings.default = './socorro/external/elasticsearch/socorro_index_settings.json' + +numberOfThreads = cm.Option() +numberOfThreads.doc = 'the number of threads to use' +numberOfThreads.default = 4 + +processorId = cm.Option() +processorId.doc = 'the id number for the processor (must already exist) (0 for create new Id, "auto" for autodetection, "host" for same host, "forcehost" for hostile take over)' +processorId.default = "forcehost" + +stackwalkCommandLine = cm.Option() +stackwalkCommandLine.doc = 'the template for the command to invoke minidump_stackwalk' +#for standard minidump_stackwalk uncomment this line: +stackwalkCommandLine.default = '$minidump_stackwalkPathname -m $dumpfilePathname $processorSymbolsPathnameList 2>/dev/null' +#for caching minidump_stackwalk uncomment this line: +#stackwalkCommandLine.default = 
'$minidump_stackwalkPathname -c $symbolCachePath -m $dumpfilePathname $processorSymbolsPathnameList 2>/dev/null'
+
+minidump_stackwalkPathname = cm.Option()
+minidump_stackwalkPathname.doc = 'the full pathname of the extern program minidump_stackwalk (quote path with embedded spaces)'
+minidump_stackwalkPathname.default = '/data/socorro/stackwalk/bin/minidump_stackwalk'
+
+exploitability_tool_command_line = cm.Option()
+exploitability_tool_command_line.doc = 'the template for the command to invoke the exploitability tool'
+exploitability_tool_command_line.default = '$exploitability_tool_pathname $dumpfilePathname 2>/dev/null'
+
+exploitability_tool_pathname = cm.Option()
+exploitability_tool_pathname.doc = 'the full pathname of the extern program exploitability tool (quote path with embedded spaces)'
+exploitability_tool_pathname.default = '/data/socorro/stackwalk/bin/exploitable'
+
+symbolCachePath = cm.Option()
+symbolCachePath.doc = 'the path where the symbol cache is found (quote path with embedded spaces)'
+symbolCachePath.default = '/mnt/socorro/symbols'
+
+processorSymbolsPathnameList = cm.Option()
+processorSymbolsPathnameList.doc = 'comma or space separated list of symbol files for minidump_stackwalk (quote paths with embedded spaces)'
+processorSymbolsPathnameList.default = "/mnt/socorro/symbols/symbols_ffx,/mnt/socorro/symbols/symbols_sea,/mnt/socorro/symbols/symbols_tbrd,/mnt/socorro/symbols/symbols_sbrd,/mnt/socorro/symbols/symbols_os"
+processorSymbolsPathnameList.fromStringConverter = lambda x: x.replace(',', ' ')
+
+crashingThreadFrameThreshold = cm.Option()
+crashingThreadFrameThreshold.doc = "the number of frames to keep in the raw dump for the crashing thread"
+crashingThreadFrameThreshold.default = 100
+
+crashingThreadTailFrameThreshold = cm.Option()
+crashingThreadTailFrameThreshold.doc = "the number of frames to keep in the raw dump at the tail of the frame list"
+crashingThreadTailFrameThreshold.default = 10
+
+processorLoopTime = cm.Option()
+processorLoopTime.doc = 'the time to wait between attempts to get jobs (HHH:MM:SS)'
+processorLoopTime.default = '0:00:06'
+processorLoopTime.fromStringConverter = cm.timeDeltaConverter
+
+checkForPriorityFrequency = cm.Option()
+checkForPriorityFrequency.doc = 'the time between checks for priority jobs (HHH:MM:SS)'
+checkForPriorityFrequency.default = '0:01:00'
+checkForPriorityFrequency.fromStringConverter = cm.timeDeltaConverter
+
+processorCheckInFrequency = cm.Option()
+processorCheckInFrequency.doc = 'the frequency in seconds for the processor to check in with the monitor'
+processorCheckInFrequency.default = '0:05:00'
+processorCheckInFrequency.fromStringConverter = cm.timeDeltaConverter
+
+batchJobLimit = cm.Option()
+batchJobLimit.doc = 'the number of jobs to pull at a time'
+batchJobLimit.default = 10000
+
+#updateInterval = cm.Option()
+#updateInterval.doc = 'How often to check for updates in this config file. Format "dd:hh:mm:ss". If 0, never update'
+#updateInterval.default = '0:0:0:0'
+
+#signalNumber = cm.Option()
+#signalNumber.doc = "'kill -signal_number the_process' causes configuration to be dynamically updated."
+#signalNumber.default = signal.SIGALRM + +signatureSentinels = cm.Option() +signatureSentinels.doc = 'a list of frame signatures that should always be considered top of the stack if present in the stack' +signatureSentinels.default = ['_purecall', + ('mozilla::ipc::RPCChannel::Call(IPC::Message*, IPC::Message*)', + lambda stack: any('CreatePairedMinidumps' in signature for signature in stack)), + 'Java_org_mozilla_gecko_GeckoAppShell_reportJavaCrash', + 'google_breakpad::ExceptionHandler::HandleInvalidParameter(wchar_t const*, wchar_t const*, wchar_t const*, unsigned int, unsigned int)'] + +irrelevantSignatureRegEx = cm.Option() +irrelevantSignatureRegEx.doc = 'a regular expression matching frame signatures that should be ignored when generating an overall signature' +irrelevantSignatureRegEx.default = '|'.join([ + '@0x[0-9a-fA-F]{2,}', + '@0x[1-9a-fA-F]', + 'ashmem', + 'app_process@0x.*', + 'core\.odex@0x.*', + '_CxxThrowException', + 'dalvik-heap', + 'dalvik-jit-code-cache', + 'dalvik-LinearAlloc', + 'dalvik-mark-stack', + 'data@app@org\.mozilla\.fennec-\d\.apk@classes\.dex@0x.*', + 'framework\.odex@0x.*', + 'google_breakpad::ExceptionHandler::HandleInvalidParameter.*', + 'KiFastSystemCallRet', + 'libandroid_runtime\.so@0x.*', + 'libbinder\.so@0x.*', + 'libc\.so@.*', + 'libc-2\.5\.so@.*', + 'libEGL\.so@.*', + 'libdvm\.so\s*@\s*0x.*', + 'libgui\.so@0x.*', + 'libicudata.so@.*', + 'libMali\.so@0x.*', + 'libutils\.so@0x.*', + 'libz\.so@0x.*', + 'linux-gate\.so@0x.*', + 'mnt@asec@org\.mozilla\.fennec-\d@pkg\.apk@classes\.dex@0x.*', + 'MOZ_Assert', + 'MOZ_Crash', + 'mozcrt19.dll@0x.*', + 'mozilla::ipc::RPCChannel::Call\(IPC::Message\*, IPC::Message\*\)', + '_NSRaiseError', + '(Nt|Zw)WaitForSingleObject(Ex)?', + '(Nt|Zw)WaitForMultipleObjects(Ex)?', + 'nvmap@0x.*', + 'org\.mozilla\.fennec-\d\.apk@0x.*', + 'RaiseException', + 'RtlpAdjustHeapLookasideDepth', + 'system@framework@*\.jar@classes\.dex@0x.*', + '___TERMINATING_DUE_TO_UNCAUGHT_EXCEPTION___', + 'WaitForSingleObjectExImplementation', + 'WaitForMultipleObjectsExImplementation', + 'RealMsgWaitFor.*' + '_ZdlPv', + 'zero', + ]) + +prefixSignatureRegEx = cm.Option() +prefixSignatureRegEx.doc = 'a regular expression matching frame signatures that should always be coupled with the following frame signature when generating an overall signature' +prefixSignatureRegEx = '|'.join([ + '@0x0', + '.*abort', + '_alloca_probe.*', + '__android_log_assert', + 'arena_.*', + 'BaseGetNamedObjectDirectory', + '.*calloc', + 'cert_.*', + 'CERT_.*', + 'CFRelease', + '_chkstk', + 'CrashInJS', + '__delayLoadHelper2', + 'dlmalloc', + 'dlmalloc_trim', + 'dvm.*', + 'EtwEventEnabled', + 'fastcopy_I', + 'fastzero_I', + '_files_getaddrinfo', + '.*free', + 'GCGraphBuilder::NoteXPCOMChild', + 'getanswer', + 'huge_dalloc', + 'ialloc', + 'init_library', + 'isalloc', + 'je_malloc', + 'jemalloc_crash', + 'je_realloc', + 'JNI_CreateJavaVM', + '_JNIEnv.*', + 'JNI_GetCreatedJavaVM.*', + 'JS_DHashTableEnumerate', + 'JS_DHashTableOperate', + 'kill', + '__libc_android_abort', + 'libobjc.A.dylib@0x1568.', + '(libxul\.so|xul\.dll|XUL)@0x.*', + 'LL_.*', + 'malloc', + '_MD_.*', + 'memcmp', + '__memcmp16', + 'memcpy', + 'memmove', + 'memset', + 'mozalloc_abort.*', + 'mozalloc_handle_oom', + 'moz_free', + 'mozilla::AndroidBridge::AutoLocalJNIFrame::~AutoLocalJNIFrame', + 'mozilla::ipc::RPCChannel::Call', + 'mozilla::ipc::RPCChannel::CxxStackFrame::CxxStackFrame', + 'mozilla::ipc::RPCChannel::EnteredCxxStack', + 'mozilla::ipc::RPCChannel::Send', + 'moz_xmalloc', + 'moz_xrealloc', + 
'NP_Shutdown', + 'nsCOMPtr.*', + 'NS_DebugBreak_P.*', + '[-+]\[NSException raise(:format:(arguments:)?)?\]', + 'nsObjCExceptionLogAbort(\(.*?\)){0,1}', + 'nsRefPtr.*', + 'nsTArray<.*', + 'nsTArray_base<.*', + 'NtUser.*', + 'objc_exception_throw', + 'objc_msgSend', + 'operator new\([^,\)]+\)', + 'NSS.*', + 'nss.*', + 'PL_.*', + 'port_.*', + 'PORT_.*', + '_PR_.*', + 'PR_.*', + 'pthread_mutex_lock', + '_purecall', + 'raise', + 'realloc', + 'recv', + '_RTC_Terminate', + 'Rtl.*', + '_Rtl.*', + '__Rtl.*', + 'SEC_.*Item', + 'seckey_.*', + 'SECKEY_.*', + 'send', + 'setjmp', + 'sigblock', + 'sigprocmask', + 'SocketAccept', + 'SocketAcceptRead', + 'SocketAvailable', + 'SocketAvailable64', + 'SocketBind', + 'SocketClose', + 'SocketConnect', + 'SocketGetName', + 'SocketGetPeerName', + 'SocketListen', + 'SocketPoll', + 'SocketRead', + 'SocketRecv', + 'SocketSend', + 'SocketShutdown', + 'SocketSync', + 'SocketTransmitFile', + 'SocketWrite', + 'SocketWritev', + 'ssl_.*', + 'SSL_.*', + 'strcat', + 'ssl3_.*', + 'strchr', + 'strcmp', + 'strcpy', + '.*strdup', + 'strlen', + 'strncpy', + 'strzcmp16', + 'strstr', + '__swrite', + 'TouchBadMemory', + '_VEC_memcpy', + '_VEC_memzero', + '.*WaitFor.*', + 'wcslen', + '__wrap_realloc', + 'WSARecv.*', + 'WSASend.*', + '_ZdaPvRKSt9nothrow_t\"', + 'zzz_AsmCodeRange_.*', + ]) + +signaturesWithLineNumbersRegEx = cm.Option() +signaturesWithLineNumbersRegEx.doc = 'any signatures that match this list should be combined with their associated source code line numbers' +signaturesWithLineNumbersRegEx.default = 'js_Interpret' + +collectAddon = cm.Option() +collectAddon.doc = "if true, parse and collect information about addons from the json file; if false, don't" +collectAddon.default = True +collectAddon.fromStringConverter = cm.booleanConverter + +collectCrashProcess = cm.Option() +collectCrashProcess.doc = "if true, parse and collect information about out of process crashes; if false, don't" +collectCrashProcess.default = True +collectCrashProcess.fromStringConverter = cm.booleanConverter + +#------------------------------------------------------------------------------- +# Logging + +syslogHost = cm.Option() +syslogHost.doc = 'syslog hostname' +syslogHost.default = 'localhost' + +syslogPort = cm.Option() +syslogPort.doc = 'syslog port' +syslogPort.default = 514 + +syslogFacilityString = cm.Option() +syslogFacilityString.doc = 'syslog facility string ("user", "local0", etc)' +syslogFacilityString.default = 'user' + +syslogLineFormatString = cm.Option() +syslogLineFormatString.doc = 'python logging system format for syslog entries' +syslogLineFormatString.default = 'Socorro Processor (pid %(process)d): %(asctime)s %(levelname)s - %(threadName)s - %(message)s' + +syslogErrorLoggingLevel = cm.Option() +syslogErrorLoggingLevel.doc = 'logging level for the log file (10 - DEBUG, 20 - INFO, 30 - WARNING, 40 - ERROR, 50 - CRITICAL)' +syslogErrorLoggingLevel.default = 10 + +stderrLineFormatString = cm.Option() +stderrLineFormatString.doc = 'python logging system format for logging to stderr' +stderrLineFormatString.default = '%(asctime)s %(levelname)s - %(threadName)s - %(message)s' + +stderrErrorLoggingLevel = cm.Option() +stderrErrorLoggingLevel.doc = 'logging level for the logging to stderr (10 - DEBUG, 20 - INFO, 30 - WARNING, 40 - ERROR, 50 - CRITICAL)' +stderrErrorLoggingLevel.default = 40 + +# This Option is kind of a hack in that it should better be done with dynamic lookup, but this is what we have +knownFlashIdentifiers = cm.Option() +knownFlashIdentifiers.doc = 'A subset of 
the known "debug identifiers" for flash versions, associated to the version' +knownFlashIdentifiers.default = { + '7224164B5918E29AF52365AF3EAF7A500':'10.1.51.66', + 'C6CDEFCDB58EFE5C6ECEF0C463C979F80':'10.1.51.66', + '4EDBBD7016E8871A461CCABB7F1B16120':'10.1', + 'D1AAAB5D417861E6A5B835B01D3039550':'10.0.45.2', + 'EBD27FDBA9D9B3880550B2446902EC4A0':'10.0.45.2', + '266780DB53C4AAC830AFF69306C5C0300':'10.0.42.34', + 'C4D637F2C8494896FBD4B3EF0319EBAC0':'10.0.42.34', + 'B19EE2363941C9582E040B99BB5E237A0':'10.0.32.18', + '025105C956638D665850591768FB743D0':'10.0.32.18', + '986682965B43DFA62E0A0DFFD7B7417F0':'10.0.23', + '937DDCC422411E58EF6AD13710B0EF190':'10.0.23', + '860692A215F054B7B9474B410ABEB5300':'10.0.22.87', + '77CB5AC61C456B965D0B41361B3F6CEA0':'10.0.22.87', + '38AEB67F6A0B43C6A341D7936603E84A0':'10.0.12.36', + '776944FD51654CA2B59AB26A33D8F9B30':'10.0.12.36', + '974873A0A6AD482F8F17A7C55F0A33390':'9.0.262.0', + 'B482D3DFD57C23B5754966F42D4CBCB60':'9.0.262.0', + '0B03252A5C303973E320CAA6127441F80':'9.0.260.0', + 'AE71D92D2812430FA05238C52F7E20310':'9.0.246.0', + '6761F4FA49B5F55833D66CAC0BBF8CB80':'9.0.246.0', + '27CC04C9588E482A948FB5A87E22687B0':'9.0.159.0', + '1C8715E734B31A2EACE3B0CFC1CF21EB0':'9.0.159.0', + 'F43004FFC4944F26AF228334F2CDA80B0':'9.0.151.0', + '890664D4EF567481ACFD2A21E9D2A2420':'9.0.151.0', + '8355DCF076564B6784C517FD0ECCB2F20':'9.0.124.0', + '51C00B72112812428EFA8F4A37F683A80':'9.0.124.0', + '9FA57B6DC7FF4CFE9A518442325E91CB0':'9.0.115.0', + '03D99C42D7475B46D77E64D4D5386D6D0':'9.0.115.0', + '0CFAF1611A3C4AA382D26424D609F00B0':'9.0.47.0', + '0F3262B5501A34B963E5DF3F0386C9910':'9.0.47.0', + 'C5B5651B46B7612E118339D19A6E66360':'9.0.45.0', + 'BF6B3B51ACB255B38FCD8AA5AEB9F1030':'9.0.28.0', + '83CF4DC03621B778E931FC713889E8F10':'9.0.16.0', + } + diff --git a/scripts/config/revisionsconfig.py.dist b/scripts/config/revisionsconfig.py.dist new file mode 100644 index 0000000000..464ac77f93 --- /dev/null +++ b/scripts/config/revisionsconfig.py.dist @@ -0,0 +1,16 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +import socorro.lib.ConfigurationManager as cm + +# Current Socorro and Breakpad revisions. This should be replaced at build time +# with the actual revision. + +socorro_revision = cm.Option() +socorro_revision.doc = 'the current revision of Socorro' +socorro_revision.default = 'CURRENT_SOCORRO_REVISION' + +breakpad_revision = cm.Option() +breakpad_revision.doc = 'the current revision of Breakpad' +breakpad_revision.default = 'CURRENT_BREAKPAD_REVISION' diff --git a/scripts/config/serverstatusconfig.py.dist b/scripts/config/serverstatusconfig.py.dist new file mode 100644 index 0000000000..3b251fefd0 --- /dev/null +++ b/scripts/config/serverstatusconfig.py.dist @@ -0,0 +1,61 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +import socorro.lib.ConfigurationManager as cm +import datetime + +from config.commonconfig import databaseHost +from config.commonconfig import databaseName +from config.commonconfig import databaseUserName +from config.commonconfig import databasePassword + +debug = cm.Option() +debug.doc = 'do debug output and routines' +debug.default = False +debug.singleCharacter = 'D' +debug.fromStringConverter = cm.booleanConverter + +initMode = cm.Option() +initMode.doc = 'Use this the first time you run the script.' +initMode.default = False +initMode.singleCharacter = 'I' +initMode.fromStringConverter = cm.booleanConverter + +processingInterval = cm.Option() +processingInterval.doc = 'how often to process reports (HHH:MM:SS)' +processingInterval.default = '00:05:00' +processingInterval.fromStringConverter = cm.timeDeltaConverter + +#------------------------------------------------------------------------------- +# Logging + +syslogHost = cm.Option() +syslogHost.doc = 'syslog hostname' +syslogHost.default = 'localhost' + +syslogPort = cm.Option() +syslogPort.doc = 'syslog port' +syslogPort.default = 514 + +syslogFacilityString = cm.Option() +syslogFacilityString.doc = 'syslog facility string ("user", "local0", etc)' +syslogFacilityString.default = 'user' + +syslogLineFormatString = cm.Option() +syslogLineFormatString.doc = 'python logging system format for syslog entries' +syslogLineFormatString.default = 'Socorro Server Status (pid %(process)d): %(asctime)s %(levelname)s - %(threadName)s - %(message)s' + +syslogErrorLoggingLevel = cm.Option() +syslogErrorLoggingLevel.doc = 'logging level for the log file (10 - DEBUG, 20 - INFO, 30 - WARNING, 40 - ERROR, 50 - CRITICAL)' +syslogErrorLoggingLevel.default = 10 + +stderrLineFormatString = cm.Option() +stderrLineFormatString.doc = 'python logging system format for logging to stderr' +stderrLineFormatString.default = '%(asctime)s %(levelname)s - %(message)s' + +stderrErrorLoggingLevel = cm.Option() +stderrErrorLoggingLevel.doc = 'logging level for the logging to stderr (10 - DEBUG, 20 - INFO, 30 - WARNING, 40 - ERROR, 50 - CRITICAL)' +stderrErrorLoggingLevel.default = 40 + + diff --git a/scripts/config/setupdatabaseconfig.py.dist b/scripts/config/setupdatabaseconfig.py.dist new file mode 100644 index 0000000000..07818fc1b0 --- /dev/null +++ b/scripts/config/setupdatabaseconfig.py.dist @@ -0,0 +1,41 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
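serverstatusconfig.py.dist relies on the same converter helpers as the other config files: processingInterval is parsed by cm.timeDeltaConverter, while initMode and debug go through cm.booleanConverter. This patch does not show the converters' implementations, but the contract of the fromStringConverter hook is roughly the following (the return values shown are assumptions based on how the options are used)::

    import socorro.lib.ConfigurationManager as cm

    # A fromStringConverter receives the raw string found in the config file
    # or on the command line and returns the typed value the application sees.
    interval = cm.timeDeltaConverter('00:05:00')  # assumed: a datetime.timedelta of 5 minutes
    in_init_mode = cm.booleanConverter('False')   # assumed: the boolean False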
+ +import socorro.lib.ConfigurationManager as cm + +from config.commonconfig import databaseHost +from config.commonconfig import databaseName +from config.commonconfig import databaseUserName +from config.commonconfig import databasePassword + +#------------------------------------------------------------------------------- +# Logging + +syslogHost = cm.Option() +syslogHost.doc = 'syslog hostname' +syslogHost.default = 'localhost' + +syslogPort = cm.Option() +syslogPort.doc = 'syslog port' +syslogPort.default = 514 + +syslogFacilityString = cm.Option() +syslogFacilityString.doc = 'syslog facility string ("user", "local0", etc)' +syslogFacilityString.default = 'user' + +syslogLineFormatString = cm.Option() +syslogLineFormatString.doc = 'python logging system format for syslog entries' +syslogLineFormatString.default = 'Socorro Setup Database (pid %(process)d): %(asctime)s %(levelname)s - %(threadName)s - %(message)s' + +syslogErrorLoggingLevel = cm.Option() +syslogErrorLoggingLevel.doc = 'logging level for the log file (10 - DEBUG, 20 - INFO, 30 - WARNING, 40 - ERROR, 50 - CRITICAL)' +syslogErrorLoggingLevel.default = 10 + +stderrLineFormatString = cm.Option() +stderrLineFormatString.doc = 'python logging system format for logging to stderr' +stderrLineFormatString.default = '%(asctime)s %(levelname)s - %(message)s' + +stderrErrorLoggingLevel = cm.Option() +stderrErrorLoggingLevel.doc = 'logging level for the logging to stderr (10 - DEBUG, 20 - INFO, 30 - WARNING, 40 - ERROR, 50 - CRITICAL)' +stderrErrorLoggingLevel.default = 10 diff --git a/scripts/config/smtpconfig.py.dist b/scripts/config/smtpconfig.py.dist new file mode 100644 index 0000000000..4ec3f91c33 --- /dev/null +++ b/scripts/config/smtpconfig.py.dist @@ -0,0 +1,33 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +import socorro.lib.ConfigurationManager as cm + +""" Note: This config file is used by Hoopsnake Email Campaign APIs + and these config options *must* be updated here, there is no + command line access """ + +smtpHostname = cm.Option() +smtpHostname.doc = 'The hostname of the SMTP provider' +smtpHostname.default = 'localhost' + +smtpPort = cm.Option() +smtpPort.doc = 'The port of the SMTP provider' +smtpPort.default = 25 + +smtpUsername = cm.Option() +smtpUsername.doc = 'The username for SMTP providers that require authentication otherwise set to None' +smtpUsername.default = None + +smtpPassword = cm.Option() +smtpPassword.doc = 'The password for SMTP providers that require authentication otherwise set to None' +smtpPassword.default = None + +fromEmailAddress = cm.Option() +fromEmailAddress.doc = 'Email Address which is used in the From field of all emails' +fromEmailAddress.default = 'no-reply@crash-stats.mozilla.com' + +unsubscribeBaseUrl = cm.Option() +unsubscribeBaseUrl.doc = 'The base url for handling un-subscribe requests. This will be used in email templates' +unsubscribeBaseUrl.default = "http://crash-stats.mozilla.com/email/subscription/%s" diff --git a/scripts/config/submitterconfig.py.dist b/scripts/config/submitterconfig.py.dist new file mode 100644 index 0000000000..29dd17a17f --- /dev/null +++ b/scripts/config/submitterconfig.py.dist @@ -0,0 +1,83 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
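smtpconfig.py.dist above only declares connection details; as its comment notes, the values are read directly by the email campaign code rather than being overridable on a command line. Purely to illustrate what the options amount to, here is a minimal smtplib sketch (not the Hoopsnake email campaign code itself), assuming the .dist file has been copied to scripts/config/smtpconfig.py; the recipient address is made up::

    import smtplib

    from config.smtpconfig import (smtpHostname, smtpPort, smtpUsername,
                                   smtpPassword, fromEmailAddress)

    server = smtplib.SMTP(smtpHostname.default, smtpPort.default)
    if smtpUsername.default and smtpPassword.default:
        # only authenticate when the provider requires it; the defaults are None
        server.login(smtpUsername.default, smtpPassword.default)
    server.sendmail(fromEmailAddress.default,
                    ['someone@example.com'],  # hypothetical recipient
                    'Subject: test message\n\nbody of the message\n')
    server.quit()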
+ +import stat +import socorro.lib.ConfigurationManager as cm + +#------------------------------------------------------------------------------- +# general + +url = cm.Option() +url.doc = "The url of the server to load test" +url.default = 'https://crash-reports.stage.mozilla.com/submit' +url.singleCharacter = 'u' + +delay = cm.Option() +delay.doc = "pause between submission queing in milliseconds" +delay.default = 0 + +dryrun = cm.Option() +dryrun.doc = "don't actually submit, just print product/version" +dryrun.default = False +dryrun.singleCharacter = 'D' + +numberOfThreads = cm.Option() +numberOfThreads.doc = 'the number of threads to use' +numberOfThreads.default = 4 + +numberOfSubmissions = cm.Option() +numberOfSubmissions.doc = 'the number of crashes to submit (all, forever, 1...)' +numberOfSubmissions.default = 'all' +numberOfSubmissions.singleCharacter = 'n' + +jsonfile = cm.Option() +jsonfile.doc = 'the pathname of a json file to submit' +jsonfile.default = None +jsonfile.singleCharacter = 'j' + +dumpfile = cm.Option() +dumpfile.doc = 'the pathname of a dumpfile to submit' +dumpfile.default = None +dumpfile.singleCharacter = 'd' + +searchRoot = cm.Option() +searchRoot.doc = 'a filesystem location to begin a search for json/dump pairs' +searchRoot.default = None +searchRoot.singleCharacter = 's' + +uniqueHangId = cm.Option() +uniqueHangId.doc = 'cache and uniquify hangids' +uniqueHangId.default = True + +#------------------------------------------------------------------------------- +# logging + +syslogHost = cm.Option() +syslogHost.doc = 'syslog hostname' +syslogHost.default = 'localhost' + +syslogPort = cm.Option() +syslogPort.doc = 'syslog port' +syslogPort.default = 514 + +syslogFacilityString = cm.Option() +syslogFacilityString.doc = 'syslog facility string ("user", "local0", etc)' +syslogFacilityString.default = 'user' + +syslogLineFormatString = cm.Option() +syslogLineFormatString.doc = 'python logging system format for syslog entries' +syslogLineFormatString.default = 'Socorro Submitter (pid %(process)d): %(asctime)s %(levelname)s - %(threadName)s - %(message)s' + +syslogErrorLoggingLevel = cm.Option() +syslogErrorLoggingLevel.doc = 'logging level for the log file (10 - DEBUG, 20 - INFO, 30 - WARNING, 40 - ERROR, 50 - CRITICAL)' +syslogErrorLoggingLevel.default = 10 + +stderrLineFormatString = cm.Option() +stderrLineFormatString.doc = 'python logging system format for logging to stderr' +stderrLineFormatString.default = '%(asctime)s %(levelname)s - %(threadName)s - %(message)s' + +stderrErrorLoggingLevel = cm.Option() +stderrErrorLoggingLevel.doc = 'logging level for the logging to stderr (10 - DEBUG, 20 - INFO, 30 - WARNING, 40 - ERROR, 50 - CRITICAL)' +stderrErrorLoggingLevel.default = 10 + diff --git a/scripts/install.sh b/scripts/install.sh index 3dadb93586..98638f1b1f 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -15,11 +15,8 @@ if [ "$BUILD_TYPE" != "tar" ]; then # FIXME could we replace w/ consul? 
cp scripts/crons/socorrorc $BUILD_DIR/etc/socorro/ - # Copy systemd service files into place - cp config/systemd/* $BUILD_DIR/usr/lib/systemd/system/ - - # Copy in production-style defaults - cp -rp config/package/* $BUILD_DIR/etc/ + # Copy system configs into place + rsync -a config/package/ $BUILD_DIR/ # Copy in Socorro setup script cp scripts/setup-socorro.sh $BUILD_DIR/usr/bin @@ -32,6 +29,22 @@ else rsync -a config $BUILD_DIR/application fi +# record current git revision in root of install dir +git rev-parse HEAD > socorro_revision.txt +cp stackwalk/revision.txt breakpad_revision.txt + +# Write down build number, if ran by Jenkins +if [ -n "$BUILD_NUMBER" ] +then + echo "$BUILD_NUMBER" > JENKINS_BUILD_NUMBER +else + echo "unknown" > JENKINS_BUILD_NUMBER +fi + +# install socorro in local virtualenv +# this must run at the end to capture any generated files above +${VIRTUAL_ENV}/bin/python setup.py install + # copy to install directory rsync -a ${VIRTUAL_ENV} $BUILD_DIR rsync -a socorro $BUILD_DIR/application @@ -47,16 +60,15 @@ rsync -a webapp-django $BUILD_DIR/ # because this file is served from the parent of the `webapp-django/` directory cp contribute.json $BUILD_DIR/ -# record current git revision in install dir -git rev-parse HEAD > $BUILD_DIR/application/socorro/external/postgresql/socorro_revision.txt -cp $BUILD_DIR/stackwalk/revision.txt $BUILD_DIR/application/socorro/external/postgresql/breakpad_revision.txt +# TODO remove these when we no longer need to support pre-RPM releases +cp socorro_revision.txt $BUILD_DIR/application/socorro +cp breakpad_revision.txt $BUILD_DIR/application/socorro -# Write down build number, if ran by Jenkins -if [ -n "$BUILD_NUMBER" ] -then - echo "$BUILD_NUMBER" > $BUILD_DIR/JENKINS_BUILD_NUMBER -fi -if [ "$BUILD_TYPE" != "tar" ]; then +if [ "$BUILD_TYPE" == "tar" ]; then + pushd $BUILD_DIR/application/scripts/config + for file in *.py.dist; do cp $file `basename $file .dist`; done + popd +else BUILD_DIR=${BUILD_DIR%%/data/socorro} fi diff --git a/scripts/movecrashes.py b/scripts/movecrashes.py new file mode 100755 index 0000000000..c23a4f75cc --- /dev/null +++ b/scripts/movecrashes.py @@ -0,0 +1,418 @@ +#!/usr/bin/env python +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +"""Export crash reports data from a PostgreSQL database or import crash reports +data into an ElasticSearch database. + +Usage: %s [-h host[:port]] command [arg1 [arg2...]] + +Commands: +export - Export crashes from a database. + export_from_db [path=. [numberofdays=0]] +import - Import a dump file into an ElasticSearch instance. + import dumpfile mappingfile +clear - Delete all socorro related indexes. + clear +rebuild - Clear database and import a dump. + rebuild dumpfile mappingfile + +Options: +-h -- ElasticSearch host and port. Default is 'localhost:9200'. 
+ +""" + +import csv +import datetime +import json +import os +import tarfile +import time +import sys + +import config.commonconfig as configModule + +import socorro.database.database as db +import socorro.lib.ConfigurationManager as configurationManager +import socorro.lib.datetimeutil as dtu +import socorro.lib.httpclient as httpc + +config = configurationManager.newConfiguration( + configurationModule=configModule, + applicationName='movecrashes.py' +) + + +def export_uuids(path, numberofdays): + """Export crash report uuids from a PostgreSQL database to a CSV file + + path - Directory where the csv file will be created. + numberofdays - Number of days of crash reports to retrieve, before the most + recent crash date. + + """ + database = db.Database(config) + connection = database.connection() + cur = connection.cursor() + + # steps + # 1. pull all distinct dates + sql = """ + SELECT DISTINCT to_char(date_processed, 'YYYY-MM-DD') as day + FROM reports + ORDER BY day DESC + """ + if numberofdays: + sql = "%s LIMIT %s" % (sql, numberofdays) + + print 'Calculating dates... ' + days = db.execute(cur, sql) + + days_list = [] + for day in days: + days_list.append(day[0]) + + store_filename = 'uuids.csv' + store_filename = os.path.normpath('%s/%s' % (path, store_filename)) + store_file = open(store_filename, 'w') + store = csv.writer(store_file, delimiter=',', quotechar='"') + print 'Store file created: %s' % store_filename + + for day in days_list: + date_from = dtu.datetimeFromISOdateString(day) + date_to = date_from + datetime.timedelta(1) + + sql = "SELECT uuid FROM reports WHERE date_processed BETWEEN %s AND %s" + + print 'Getting crash reports for day %s' % date_from.date() + crashes_list = db.execute(cur, sql, (date_from, date_to)) + for crash in crashes_list: + store.writerow(crash) + + store_file.close() + connection.close() + return store_filename + + +def export(path, numberofdays=0): + """Export crash reports from a PostgreSQL database. + + path - Directory where the dump file will be created. + numberofdays - Number of days of crash reports to retrieve, before the most + recent crash date. + + """ + database = db.Database(config) + connection = database.connection() + cur = connection.cursor() + + crash_files = [] + fields_list = ("client_crash_date", + "date_processed", + "uuid", + "product", + "version", + "build", + "signature", + "url", + "install_age", + "last_crash", + "uptime", + "cpu_name", + "cpu_info", + "reason", + "address", + "os_name", + "os_version", + "email", + "build_date", + "user_id", + "started_datetime", + "completed_datetime", + "success", + "truncated", + "processor_notes", + "user_comments", + "app_notes", + "distributor", + "distributor_version", + "topmost_filenames", + "addons_checked", + "flash_version", + "hangid", + "process_type", + "release_channel") + + # steps + # 1. pull all distinct dates + sql = """ + SELECT DISTINCT to_char(date_processed, 'YYYY-MM-DD') as day + FROM reports + ORDER BY day DESC + """ + if numberofdays: + sql = "%s LIMIT %s" % (sql, numberofdays) + + print 'Calculating dates... 
' + days = db.execute(cur, sql) + + days_list = [] + for day in days: + days_list.append(day[0]) + + #~ days_list = [ + #~ '2012-03-04T00:00:00+00:00' + #~ ] + + store_filename = 'dump.json' + store_filename = os.path.normpath('%s/%s' % (path, store_filename)) + store = open(store_filename, 'w') + print 'Store file created: %s' % store_filename + + indexes_filename = 'indexes.txt' + indexes_filename = os.path.normpath('%s/%s' % (path, indexes_filename)) + indexes = open(indexes_filename, 'w') + print 'Indexes file created: %s' % indexes_filename + + for day in days_list: + date_from = dtu.datetimeFromISOdateString(day) + date_to = date_from + datetime.timedelta(1) + datestr = date_from.strftime('%y%m%d') + es_index = 'socorro_%s' % datestr + es_type = 'crash_reports' + action_line = '{"index":{"_index":"%s","_type":"%s"}}\n' % ( + es_index, es_type) + + indexes.write('%s\n' % es_index) + + # 2. for each date, pull all crashes of the day + day_sql = " ".join(("SELECT %s" % ", ".join(fields_list), + "FROM reports", + "WHERE date_processed BETWEEN %s AND %s")) + + print 'Getting crash reports for day %s' % date_from.date() + crashes_list = db.execute(cur, day_sql, (date_from, date_to)) + for crash in crashes_list: + # 3. for each crash report + json_crash = dict(zip(fields_list, crash)) + + # stringify datetime fields + for i in json_crash: + if isinstance(json_crash[i], datetime.datetime): + json_crash[i] = dtu.date_to_string(json_crash[i]) + + store.write(action_line) + store.write('%s\n' % json.dumps(json_crash)) + + store.close() + crash_files.append(store_filename) + indexes.close() + crash_files.append(indexes_filename) + connection.close() + return generate_dump(crash_files, path) + + +def generate_dump(files, path): + """Return the filename of a tar file containing all given files. + """ + os.chdir(path) + dumpfilename = './dump.tar' + dumpfile = tarfile.open(dumpfilename, 'w') + for name in files: + dumpfile.add(name.replace(path, '')) + dumpfile.close() + + return dumpfilename + + +def import_dump(es_connection, dump_filename, mapping_filename): + """Import a dump into an ElasticSearch instance. + + filename - Path to the dump. + es_connection - HTTP connection to ElasticSearch instance. + + """ + print 'Importing crashes from dump %s' % dump_filename + + dump = tarfile.open(dump_filename) + path = '/tmp/' + dump.extractall(path) + members = dump.getnames() + crash_file_handlers = [] + + for crash_file in members: + if 'indexes' in crash_file: + indexes_file_handler = open('%s%s' % (path, crash_file), 'r') + else: + crash_file_handlers.append(open('%s%s' % (path, crash_file), 'r')) + + # PUT mapping for each index + for es_index in indexes_file_handler: + es_index = '/%s' % es_index.strip() + es_uri = '%s/crash_reports' % es_index + es_connection.put(es_index) + import_mapping(es_connection, es_uri, mapping_filename) + + indexes_file_handler.close() + + sys.stdout.write('Indexing crash reports \r') + sys.stdout.flush() + for crash_file_handler in crash_file_handlers: + i = 0 + j = 0 + maxLines = 50000 + stream = [] + for line in crash_file_handler: + stream.append(line) + i += 1 + if i >= maxLines: + j += i + sys.stdout.write('Indexing crash reports... %d \r' % j) + sys.stdout.flush() + es_connection.post('/_bulk?refresh=true', ''.join(stream)) + time.sleep(20) + + i = 0 + stream = [] + print '\rIndexing crash reports... 
%d' % (j + i) + es_connection.post('/_bulk', ''.join(stream)) + crash_file_handler.close() + print "Indexing done" + + +def delete_existing_indexes(es_connection): + """Delete all socorro related indexes from an ElasticSearch instance. + + Concerned indexes are the ones matching '*socorro_*'. + + """ + print 'Clearing ElasticSearch instance... ' + + http_response = es_connection.get("/_status") + + try: + indexes = json.loads(http_response) + except TypeError: + print "An error occured while getting a list of all indexes from ES" + print http_response + + for index in indexes["indices"]: + if "socorro_" in index: + http_response = es_connection.delete(index) + + +def import_mapping(es_connection, es_uri, mapping_filename): + """ + """ + mapping = open(mapping_filename) + uri = '%s/_mapping' % es_uri + print 'Importing mapping from file %s to %s' % (mapping_filename, uri) + es_connection.post(uri, mapping.read()) + mapping.close() + + +if __name__ == '__main__': + # timing execution + start_time = time.time() + + # default values + day = datetime.date.today() + numberofdays = 7 + crashes_per_day = 1000 + es_host = 'localhost' + es_port = '9200' + + def usage(): + print __doc__ % sys.argv[0] + + if len(sys.argv) <= 1 or sys.argv[1] == '--help': + usage() + sys.exit(0) + + argi = 1 + if sys.argv[argi] == '-h': + parts = sys.argv[argi + 1].split(':') + es_host = parts[0] + if len(parts) == 2: + es_port = int(parts[1]) + argi += 2 + + cmd = sys.argv[argi] + args = sys.argv[argi + 1:] + + es_connection = httpc.HttpClient(es_host, es_port, timeout=60) + + if cmd == 'export': + # default values + path = '.' + numberofdays = 0 + + if len(args) >= 1: + path = args[0] + if len(args) >= 2: + numberofdays = args[1] + + cfile = export(path, numberofdays) + print 'Generated crash file: %s' % cfile + + if cmd == 'export_uuids': + # default values + path = '.' + numberofdays = 0 + + if len(args) >= 1: + path = args[0] + if len(args) >= 2: + numberofdays = args[1] + + cfile = export_uuids(path, numberofdays) + print 'Generated uuids file: %s' % cfile + + elif cmd == 'import': + if len(args) != 2: + usage() + sys.exit(1) + dump = args[0] + mapping = args[1] + with es_connection: + import_dump(es_connection, dump, mapping) + print 'Imported dump: %s' % dump + + elif cmd == 'clear': + with es_connection: + delete_existing_indexes(es_connection) + print 'Database cleared' + + elif cmd == 'rebuild': + if len(args) != 2: + usage() + sys.exit(1) + dump = args[0] + mapping = args[1] + with es_connection: + delete_existing_indexes(es_connection) + import_dump(es_connection, dump, mapping) + print 'Database cleared and rebuilt from dump: %s' % dump + + else: + usage() + sys.exit(0) + + # Nicely displaying the total time of execution + exec_time = time.time() - start_time + exec_hours = 0 + exec_minutes = 0 + exec_seconds = 0 + + if exec_time > 3600: + exec_hours = exec_time / 3600 + exec_time = exec_time % 3600 + if exec_time > 60: + exec_minutes = exec_time / 60 + exec_time = exec_time % 60 + exec_seconds = exec_time + + print "Execution time: %d hours, %d minutes and %d seconds" % ( + exec_hours, exec_minutes, exec_seconds) diff --git a/scripts/startBugzilla.py b/scripts/startBugzilla.py new file mode 100644 index 0000000000..455a460c3f --- /dev/null +++ b/scripts/startBugzilla.py @@ -0,0 +1,37 @@ +#! /usr/bin/env python +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. 
If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +import sys +import logging +import logging.handlers + +try: + import config.bugzillaconfig as configModule +except ImportError: + import bugzillaconfig as configModule + +import socorro.lib.ConfigurationManager as configurationManager +import socorro.cron.bugzilla as bug +import socorro.lib.util as sutil + +try: + config = configurationManager.newConfiguration(configurationModule=configModule, applicationName="Bugzilla Associations 0.1") +except configurationManager.NotAnOptionError, x: + print >>sys.stderr, x + print >>sys.stderr, "for usage, try --help" + sys.exit() + +logger = logging.getLogger("bugzilla") +logger.setLevel(logging.DEBUG) + +sutil.setupLoggingHandlers(logger, config) +sutil.echoConfig(logger, config) + +try: + bug.record_associations(config) +finally: + logger.info("done.") + diff --git a/scripts/startDailyMatviews.py b/scripts/startDailyMatviews.py new file mode 100644 index 0000000000..a4b5996078 --- /dev/null +++ b/scripts/startDailyMatviews.py @@ -0,0 +1,43 @@ +#! /usr/bin/env python +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +import sys +import logging +import logging.handlers +from datetime import date, timedelta + +try: + import config.dailyMatviewsConfig as configModule +except ImportError: + import dailyMatviewsConfig as configModule + +import socorro.lib.ConfigurationManager as configurationManager +import socorro.cron.dailyMatviews as dailyMatviews +import socorro.lib.util as sutil + +try: + config = configurationManager.newConfiguration( + configurationModule=configModule, applicationName="dailyMatviews 0.1") +except configurationManager.NotAnOptionError, x: + print >>sys.stderr, x + print >>sys.stderr, "for usage, try --help" + sys.exit() + +logger = logging.getLogger("dailyMatviews") +logger.setLevel(logging.DEBUG) + +sutil.setupLoggingHandlers(logger, config) +sutil.echoConfig(logger, config) + +exitCode = 255 + +try: + targetDate = date.today() - timedelta(1) + exitCode = dailyMatviews.update(config, targetDate) +finally: + logger.info("done.") + +sys.exit(exitCode) diff --git a/scripts/startDailyUrl.py b/scripts/startDailyUrl.py new file mode 100755 index 0000000000..eb474b4c51 --- /dev/null +++ b/scripts/startDailyUrl.py @@ -0,0 +1,39 @@ +#! /usr/bin/env python +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
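startBugzilla.py and startDailyMatviews.py above, and every start*.py script that follows in this patch, share the same boilerplate: import the job's config module with a fallback so the script also works when run from inside scripts/config, build the configuration, wire up logging through socorro.lib.util, and run the job inside try/finally. A new cron job would follow the same skeleton; every name containing "myjob" below is made up for illustration, standing in for the real config and job modules::

    #! /usr/bin/env python
    import sys
    import logging
    import logging.handlers

    try:
        import config.myjobconfig as configModule   # hypothetical config module
    except ImportError:
        import myjobconfig as configModule

    import socorro.lib.ConfigurationManager as configurationManager
    import socorro.lib.util as sutil
    import socorro.cron.myjob as myjob               # hypothetical job module

    try:
        config = configurationManager.newConfiguration(
            configurationModule=configModule, applicationName="My Job 0.1")
    except configurationManager.NotAnOptionError, x:
        print >>sys.stderr, x
        print >>sys.stderr, "for usage, try --help"
        sys.exit(1)

    logger = logging.getLogger("myjob")
    logger.setLevel(logging.DEBUG)

    sutil.setupLoggingHandlers(logger, config)
    sutil.echoConfig(logger, config)

    try:
        myjob.run(config)   # the real scripts call their module's entry point here
    finally:
        logger.info("done.")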
+ + +import sys +import logging +import logging.handlers + +try: + import config.dailyurlconfig as configModule +except ImportError: + import dailyurlconfig as configModule + +import socorro.lib.ConfigurationManager as configurationManager +import socorro.cron.dailyUrl as url +import socorro.lib.util as sutil + +try: + config = configurationManager.newConfiguration(configurationModule=configModule, applicationName="Daily URL Dump 0.1") +except configurationManager.NotAnOptionError, x: + print >>sys.stderr, x + print >>sys.stderr, "for usage, try --help" + sys.exit() + +logger = logging.getLogger("dailyUrlDump") +logger.setLevel(logging.DEBUG) + +sutil.setupLoggingHandlers(logger, config) +sutil.echoConfig(logger, config) + +try: + url.dailyUrlDump(config) +finally: + logger.info("done.") + + + diff --git a/scripts/startDuplicates.py b/scripts/startDuplicates.py new file mode 100644 index 0000000000..ecfce013a7 --- /dev/null +++ b/scripts/startDuplicates.py @@ -0,0 +1,39 @@ +#! /usr/bin/env python +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +import sys +import logging +import logging.handlers + +try: + import config.duplicatesconfig as configModule +except ImportError: + import duplicatesconfig as configModule + +import socorro.lib.ConfigurationManager as configurationManager +import socorro.cron.duplicates as duplicates +import socorro.lib.util as sutil + +try: + config = configurationManager.newConfiguration(configurationModule=configModule, applicationName="Duplicate Detector 0.1") +except configurationManager.NotAnOptionError, x: + print >>sys.stderr, x + print >>sys.stderr, "for usage, try --help" + sys.exit() + +logger = logging.getLogger("duplicates") +logger.setLevel(logging.DEBUG) + +sutil.setupLoggingHandlers(logger, config) +sutil.echoConfig(logger, config) + +try: + duplicates.find_duplicates(config) +finally: + logger.info("done.") + + + diff --git a/scripts/startFixBrokenDumps.py b/scripts/startFixBrokenDumps.py new file mode 100644 index 0000000000..e8290de308 --- /dev/null +++ b/scripts/startFixBrokenDumps.py @@ -0,0 +1,41 @@ +#! /usr/bin/env python +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ + +import logging +import logging.handlers +import sys + +try: + import config.fixbrokendumpsconfig as configModule +except ImportError: + import fixbrokendumpsconfig as configModule + +import socorro.lib.ConfigurationManager as configurationManager +import socorro.cron.fixBrokenDumps as fixBrokenDumps +import socorro.lib.util as sutil + +try: + config = configurationManager.newConfiguration(configurationModule=configModule, applicationName="Fix Broken Dumps") +except configurationManager.NotAnOptionError, x: + print >>sys.stderr, x + print >>sys.stderr, "for usage, try --help" + sys.exit(1) + +logger = logging.getLogger("fix_broken_dumps") +logger.setLevel(logging.DEBUG) + +sutil.setupLoggingHandlers(logger, config) +sutil.echoConfig(logger, config) + +try: + #last_date_processed = fixBrokenDumps.fix(config, logger, config.brokenFirefoxLinuxQuery, config.brokenFirefoxLinuxFixer) + #last_date_processed = fixBrokenDumps.fix(config, logger, config.brokenFennecQuery, config.brokenFennecFixer) + last_date_processed = fixBrokenDumps.fix(config, logger, config.brokenBoot2GeckoQuery, config.brokenBoot2GeckoFixer) + + fixBrokenDumps.save_last_run_date(config, last_date_processed) + logger.debug('stored last_date_processed: %s' % last_date_processed) +finally: + logger.info("done.") diff --git a/scripts/startFtpScraper.py b/scripts/startFtpScraper.py new file mode 100644 index 0000000000..f50a0e3912 --- /dev/null +++ b/scripts/startFtpScraper.py @@ -0,0 +1,52 @@ +#! /usr/bin/env python +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +""" +startFtpScraper.py is used to get the primary nightly builds from +ftp.mozilla.org, record the build information and provide that +information through the Crash Reporter website. + +This script can be run as often as desired, and will automatically backfill. +""" + +import logging +import logging.handlers +import datetime + +try: + import config.ftpscraperconfig as configModule +except ImportError: + import ftpscraperconfig as configModule + +import socorro.cron.ftpscraper as ftpscraper +import socorro.lib.ConfigurationManager as cfgManager +import socorro.lib.util as sutil + +config = cfgManager.newConfiguration(configurationModule=configModule, + applicationName='startFtpScraper.py') +assert "databaseHost" in config, "databaseHost missing from config" +assert "databaseName" in config, "databaseName missing from config" +assert "databaseUserName" in config, "databaseUserName missing from config" +assert "databasePassword" in config, "databasePassword missing from config" +assert "base_url" in config, "base_url missing from config" +assert "products" in config, "products missing from config" +assert "backfillDate" in config, "backfillDate missing from config" + +logger = logging.getLogger("ftpscraper") +logger.setLevel(logging.DEBUG) + +sutil.setupLoggingHandlers(logger, config) +sutil.echoConfig(logger, config) + +config.logger = logger + +try: + backfill_date = None + if config.backfillDate != None: + backfill_date = datetime.datetime.strptime(config.backfillDate, + '%Y-%m-%d') + ftpscraper.recordBuilds(config, backfill_date=backfill_date) +finally: + logger.info("Done.") diff --git a/scripts/startMonitor.py b/scripts/startMonitor.py new file mode 100755 index 0000000000..74bf169cdf --- /dev/null +++ b/scripts/startMonitor.py @@ -0,0 +1,40 @@ +#! 
/usr/bin/env python +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +import sys +import logging +import logging.handlers + +try: + import config.monitorconfig as configModule +except ImportError: + import monitorconfig as configModule + +import socorro.monitor.monitor as monitor +import socorro.lib.ConfigurationManager as configurationManager +import socorro.lib.util as sutil + +try: + config = configurationManager.newConfiguration(configurationModule=configModule, applicationName="Socorro Monitor 2.0") +except configurationManager.NotAnOptionError, x: + print >>sys.stderr, x + print >>sys.stderr, "for usage, try --help" + sys.exit() + +logger = logging.getLogger("monitor") +logger.setLevel(logging.DEBUG) + +sutil.setupLoggingHandlers(logger, config) +sutil.echoConfig(logger, config) + +try: + while True: + m = monitor.Monitor(config) + m.start() +finally: + logger.info("done.") + + diff --git a/scripts/startReportsClean.py b/scripts/startReportsClean.py new file mode 100644 index 0000000000..560bf44792 --- /dev/null +++ b/scripts/startReportsClean.py @@ -0,0 +1,37 @@ +#! /usr/bin/env python +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +import sys +import logging +import logging.handlers + +try: + import config.reportsclean as configModule +except ImportError: + import reportsclean as configModule + +import socorro.lib.ConfigurationManager as configurationManager +import socorro.cron.reportsClean as reportsClean +import socorro.lib.util as sutil + +try: + config = configurationManager.newConfiguration(configurationModule=configModule, applicationName="Reports Clean 0.1") +except configurationManager.NotAnOptionError, x: + print >>sys.stderr, x + print >>sys.stderr, "for usage, try --help" + sys.exit() + +logger = logging.getLogger("reportsClean") +logger.setLevel(logging.DEBUG) + +sutil.setupLoggingHandlers(logger, config) +sutil.echoConfig(logger, config) + +try: + reportsClean.update_reports_clean(config) +finally: + logger.info("done.") + diff --git a/scripts/startServerStatus.py b/scripts/startServerStatus.py new file mode 100755 index 0000000000..d86d2fed7a --- /dev/null +++ b/scripts/startServerStatus.py @@ -0,0 +1,36 @@ +#! /usr/bin/env python +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ + +import logging +import logging.handlers +import sys + +try: + import config.serverstatusconfig as configModule +except ImportError: + import serverstatusconfig as configModule + +import socorro.lib.ConfigurationManager as configurationManager +import socorro.cron.serverstatus as serverstatus +import socorro.lib.util as sutil + +try: + config = configurationManager.newConfiguration(configurationModule=configModule, applicationName="Server Status Summary") +except configurationManager.NotAnOptionError, x: + print >>sys.stderr, x + print >>sys.stderr, "for usage, try --help" + sys.exit(1) + +logger = logging.getLogger("server_status_summary") +logger.setLevel(logging.DEBUG) + +sutil.setupLoggingHandlers(logger, config) +sutil.echoConfig(logger, config) + +try: + serverstatus.update(config, logger) +finally: + logger.info("done.") diff --git a/scripts/startTopCrashesBySignature.py b/scripts/startTopCrashesBySignature.py new file mode 100755 index 0000000000..ce5f3a611a --- /dev/null +++ b/scripts/startTopCrashesBySignature.py @@ -0,0 +1,40 @@ +#! /usr/bin/env python +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +import logging +import logging.handlers +import sys +import time + +try: + import config.topCrashesBySignatureConfig as configModule +except ImportError: + import topCrashesBySignatureConfig as configModule + +import socorro.lib.ConfigurationManager as configurationManager +import socorro.cron.topCrashesBySignature as topcrasher +import socorro.lib.util as sutil + +try: + config = configurationManager.newConfiguration(configurationModule=configModule, applicationName="Top Crashes Summary") +except configurationManager.NotAnOptionError, x: + print >>sys.stderr, x + print >>sys.stderr, "for usage, try --help" + sys.exit(1) + +logger = logging.getLogger("topCrashBySignature") +logger.setLevel(logging.DEBUG) + +sutil.setupLoggingHandlers(logger, config) +sutil.echoConfig(logger, config) + +try: + before = time.time() + tc = topcrasher.TopCrashesBySignature(config) + count = tc.processDateInterval() + logger.info("Successfully processed %s items in %3.2f seconds",count, time.time()-before) +finally: + logger.info("done.") diff --git a/scripts/startTopCrashesByUrl.py b/scripts/startTopCrashesByUrl.py new file mode 100755 index 0000000000..9e73be2789 --- /dev/null +++ b/scripts/startTopCrashesByUrl.py @@ -0,0 +1,35 @@ +#! /usr/bin/env python +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ + +import logging +import logging.handlers +import sys +import time + +try: + import config.topCrashesByUrlConfig as configModule +except ImportError: + import topCrashesByUrlConfig as configModule + +import socorro.lib.ConfigurationManager as configurationManager +import socorro.cron.topCrashesByUrl as tcbyurl +import socorro.lib.util as sutil + +config = configurationManager.newConfiguration(configurationModule=configModule, applicationName="Top Crash By URL Summary") + +logger = logging.getLogger("topCrashesByUrl") +logger.setLevel(logging.DEBUG) + +sutil.setupLoggingHandlers(logger, config) +sutil.echoConfig(logger, config) + +try: + before = time.time() + tu = tcbyurl.TopCrashesByUrl(config) + tu.processDateInterval() + logger.info("Successfully ran in %d seconds" % (time.time() - before)) +finally: + logger.info("done.") diff --git a/scripts/startUpdateADUs.py b/scripts/startUpdateADUs.py new file mode 100644 index 0000000000..030a86a62e --- /dev/null +++ b/scripts/startUpdateADUs.py @@ -0,0 +1,38 @@ +#! /usr/bin/env python +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +import sys +import logging +import logging.handlers + +try: + import config.updateadus as configModule +except ImportError: + import updateadus as configModule + +import socorro.lib.ConfigurationManager as configurationManager +import socorro.cron.updateADUs as updateADUs +import socorro.lib.util as sutil + +try: + config = configurationManager.newConfiguration( + configurationModule=configModule, applicationName="Update ADUs 0.1") +except configurationManager.NotAnOptionError, x: + print >>sys.stderr, x + print >>sys.stderr, "for usage, try --help" + sys.exit() + +logger = logging.getLogger("updateADUs") +logger.setLevel(logging.DEBUG) + +sutil.setupLoggingHandlers(logger, config) +sutil.echoConfig(logger, config) + +try: + updateADUs.update_adus(config) +finally: + logger.info("done.") + diff --git a/scripts/test.sh b/scripts/test.sh index 28ee2060d5..6b24675c76 100755 --- a/scripts/test.sh +++ b/scripts/test.sh @@ -1,5 +1,4 @@ #! /bin/bash -ex - source scripts/defaults NOSE="$VIRTUAL_ENV/bin/nosetests socorro -s" diff --git a/socorro/collector/submitter_app.py b/socorro/collector/submitter_app.py index 35539d186d..87dbd1632f 100755 --- a/socorro/collector/submitter_app.py +++ b/socorro/collector/submitter_app.py @@ -24,7 +24,7 @@ CrashStorageBase, FileDumpsMapping, ) -from socorro.external.fs.filesystem import findFileGenerator +from socorro.external.filesystem.filesystem import findFileGenerator from socorro.lib.util import DotDict from socorro.external.postgresql.dbapi2_util import execute_query_iter diff --git a/socorro/cron/dailyUrl.py b/socorro/cron/dailyUrl.py new file mode 100644 index 0000000000..dcd79e93c9 --- /dev/null +++ b/socorro/cron/dailyUrl.py @@ -0,0 +1,225 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
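The new module below, socorro/cron/dailyUrl.py, builds one large SQL statement per day and streams the rows into gzipped, tab-delimited CSV files: a private one with full details and, optionally, a scrubbed public one. Besides the database and logging options, it expects day, product, version and output path settings from its config module, which is not part of this patch. A small sketch of the helper defined just below, setup_query_parameters, which turns those settings into the strings interpolated into the SQL (the example values are made up)::

    import datetime

    import socorro.cron.dailyUrl as dailyUrl
    from socorro.lib.util import DotDict

    cfg = DotDict({'day': datetime.date(2012, 3, 4),   # the day to dump
                   'product': 'Firefox',
                   'version': ''})

    params = dailyUrl.setup_query_parameters(cfg)
    # params.yesterday_str == '2012-03-04'
    # params.now_str      == '2012-03-05'
    # params.prod_phrase  == "and r.product = 'Firefox'"
    # params.ver_phrase   == ''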
+ +###!/usr/bin/python + + +import logging +import copy +import datetime as dt +import gzip +import csv +import time +import os.path + +import contextlib + +logger = logging.getLogger("dailyUrlDump") + +import socorro.database.database as sdb +import socorro.lib.util as util + +from socorro.database.cachedIdAccess import IdCache + +sql = """ + select + r.signature, -- 0 + r.url, -- 1 + 'http://crash-stats.mozilla.com/report/index/' || r.uuid as uuid_url, -- 2 + to_char(r.client_crash_date,'YYYYMMDDHH24MI') as client_crash_date, -- 3 + to_char(r.date_processed,'YYYYMMDDHH24MI') as date_processed, -- 4 + r.last_crash, -- 5 + r.product, -- 6 + r.version, -- 7 + r.build, -- 8 + '' as branch, -- 9 + r.os_name, --10 + r.os_version, --11 + r.cpu_name || ' | ' || r.cpu_info as cpu_info, --12 + r.address, --13 + array(select ba.bug_id from bug_associations ba where ba.signature = r.signature) as bug_list, --14 + r.user_comments, --15 + r.uptime as uptime_seconds, --16 + case when (r.email is NULL OR r.email='') then '' else r.email end as email, --17 + (select sum(adi_count) from raw_adi adi + where adi.date = '%(now_str)s' + and r.product = adi.product_name and r.version = adi.product_version + and substring(r.os_name from 1 for 3) = substring(adi.product_os_platform from 1 for 3) + and r.os_version LIKE '%%'||adi.product_os_version||'%%') as adu_count, --18 + r.topmost_filenames, --19 + case when (r.addons_checked is NULL) then '[unknown]'when (r.addons_checked) then 'checked' else 'not' end as addons_checked, --20 + r.flash_version, --21 + r.hangid, --22 + r.reason, --23 + r.process_type, --24 + r.app_notes, --25 + r.install_age, --26 + rd.duplicate_of, --27 + r.release_channel, --28 + r.productid --29 + from + reports r left join reports_duplicates rd on r.uuid = rd.uuid + where + '%(yesterday_str)s' <= r.date_processed and r.date_processed < '%(now_str)s' + %(prod_phrase)s %(ver_phrase)s + order by 5 -- r.date_processed, munged + """ + +#------------------------------------------------------------------------------- +def setup_query_parameters(config): + now = config.day + dt.timedelta(1) + now_str = now.strftime('%Y-%m-%d') + yesterday = config.day + yesterday_str = yesterday.strftime('%Y-%m-%d') + logger.debug("config.day = %s; now = %s; yesterday = %s", + config.day, + now, + yesterday) + prod_phrase = '' + try: + if config.product != '': + if ',' in config.product: + prod_list = [x.strip() for x in config.product.split(',')] + prod_phrase = ("and r.product in ('%s')" % + "','".join(prod_list)) + else: + prod_phrase = "and r.product = '%s'" % config.product + except Exception: + util.reportExceptionAndContinue(logger) + ver_phrase = '' + try: + if config.version != '': + if ',' in config.product: + ver_list = [x.strip() for x in config.version.split(',')] + ver_phrase = ("and r.version in ('%s')" % + "','".join(ver_list)) + else: + ver_phrase = "and r.version = '%s'" % config.version + except Exception: + util.reportExceptionAndContinue(logger) + + return util.DotDict({ 'now_str' : now_str, + 'yesterday_str' : yesterday_str, + 'prod_phrase' : prod_phrase, + 'ver_phrase' : ver_phrase}) + +#------------------------------------------------------------------------------- +@contextlib.contextmanager +def gzipped_csv_files(config, gzip=gzip, csv=csv): + private_out_filename = ("%s-crashdata.csv.gz" + % config.day.strftime('%Y%m%d')) + private_out_pathname = os.path.join(config.outputPath, + private_out_filename) + private_gzip_file_handle = gzip.open(private_out_pathname, "w") + 
private_csv_file_handle = csv.writer(private_gzip_file_handle, + delimiter='\t', + lineterminator='\n') + + pubic_out_filename = ("%s-pub-crashdata.csv.gz" + % config.day.strftime('%Y%m%d')) + public_out_pathname = None + public_out_directory = config.get('publicOutputPath') + public_gzip_file_handle = None + public_csv_file_handle = None + if public_out_directory: + public_out_pathname = os.path.join(public_out_directory, + pubic_out_filename) + public_gzip_file_handle = gzip.open(public_out_pathname, "w") + public_csv_file_handle = csv.writer(public_gzip_file_handle, + delimiter='\t', + lineterminator='\n') + else: + logger.info("Will not create public (bowdlerized) gzip file") + yield (private_csv_file_handle, public_csv_file_handle) + private_gzip_file_handle.close() + if public_gzip_file_handle: + public_gzip_file_handle.close() + +#------------------------------------------------------------------------------- +def process_crash(a_crash_row, id_cache): + column_value_list = [] + os_name = None + ooid = '' + for i, x in enumerate(a_crash_row): + if x is None: + x = r'\N' + if i == 2: + ooid = x.rsplit('/',1)[-1] + if i == 10: #r.os_name + x = os_name = x.strip() + if i == 11: #r.os_version + # per bug 519703 + x = id_cache.getAppropriateOsVersion(os_name, x) + os_name=None + if i == 14: #bug_associations.bug_id + x = ','.join(str(bugid) for bugid in x) + if i == 15: #r.user_comments + x = x.replace('\t',' '); # per bug 519703 + if i == 17: #r.email -- show 'email' if the email is likely useful + # per bugs 529431/519703 + if '@' in x: + x='yes' + else: + x = '' + if type(x) == str: + x = x.strip().replace('\r','').replace('\n',' | ') + column_value_list.append(x) + return column_value_list + +#------------------------------------------------------------------------------- +def write_row(file_handles_tuple, + crash_list): + """ + Write a row to each file: Seen by internal users (full details), and + external users (bowdlerized) + """ + private_file_handle, public_file_handle = file_handles_tuple + # logger.debug("Writing crash %s (%s)",crash_list,len(crash_list)) + private_file_handle.writerow(crash_list) + crash_list[1] = 'URL (removed)' # remove url + crash_list[17] = '' # remove email + if public_file_handle: + public_file_handle.writerow(crash_list) + +#------------------------------------------------------------------------------- +def dailyUrlDump(config, sdb=sdb, + gzipped_csv_files=gzipped_csv_files, + IdCache=IdCache, + write_row=write_row, + process_crash=process_crash, + logger=logger): + dbConnectionPool = sdb.DatabaseConnectionPool(config, logger) + # Set the temp_buffers for this session + databaseTempbuffers = '8MB' # default + if 'databaseTempbuffers' in config: + databaseTempbuffers = config.databaseTempbuffers + try: + try: + db_conn, db_cursor = dbConnectionPool.connectionCursorPair() + + with gzipped_csv_files(config) as csv_file_handles_tuple: + headers_not_yet_written = True + id_cache = IdCache(db_cursor) + sql_parameters = setup_query_parameters(config) + logger.debug("config.day = %s; now = %s; yesterday = %s", + config.day, + sql_parameters.now_str, + sql_parameters.yesterday_str) + sql_query = sql % sql_parameters + logger.debug("SQL is: %s", sql_query) + db_cursor.execute(""" SET TEMP_BUFFERS = %s """, (databaseTempbuffers,)); + for crash_row in sdb.execute(db_cursor, sql_query): + if headers_not_yet_written: + write_row(csv_file_handles_tuple, + [x[0] for x in db_cursor.description]) + headers_not_yet_written = False + column_value_list = 
process_crash(crash_row, id_cache) + write_row(csv_file_handles_tuple, + column_value_list) + # end for loop over each crash_row + finally: + dbConnectionPool.cleanup() + except: + util.reportExceptionAndContinue(logger) + diff --git a/socorro/cron/jobs/daily_url.py b/socorro/cron/jobs/daily_url.py new file mode 100644 index 0000000000..c89736c9da --- /dev/null +++ b/socorro/cron/jobs/daily_url.py @@ -0,0 +1,347 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +import contextlib +import csv +import datetime +import gzip +import os.path +import subprocess + +from configman import Namespace +from crontabber.base import BaseCronApp +from crontabber.mixins import ( + as_backfill_cron_app, + with_postgres_transactions, + with_single_postgres_transaction +) +from socorro.database.cachedIdAccess import IdCache +from socorro.lib.util import DotDict + +SQL = """ +select + r.signature, -- 0 + r.url, -- 1 + 'http://crash-stats.mozilla.com/report/index/' || r.uuid as uuid_url, -- 2 + to_char(r.client_crash_date,'YYYYMMDDHH24MI') as client_crash_date, -- 3 + to_char(r.date_processed,'YYYYMMDDHH24MI') as date_processed, -- 4 + r.last_crash, -- 5 + r.product, -- 6 + r.version, -- 7 + r.build, -- 8 + '' as branch, -- 9 + r.os_name, --10 + r.os_version, --11 + r.cpu_name || ' | ' || r.cpu_info as cpu_info, --12 + r.address, --13 + array(select ba.bug_id from bug_associations ba where ba.signature = r.signature) as bug_list, --14 + r.user_comments, --15 + r.uptime as uptime_seconds, --16 + case when (r.email is NULL OR r.email='') then '' else r.email end as email, --17 + (select sum(adi_count) from raw_adi adu + where adu.date = '%(now_str)s' + and r.product = adu.product_name and r.version = adu.product_version + and substring(r.os_name from 1 for 3) = substring(adu.product_os_platform from 1 for 3) + and r.os_version LIKE '%%'||adu.product_os_version||'%%') as adu_count, --18 + r.topmost_filenames, --19 + case when (r.addons_checked is NULL) then '[unknown]'when (r.addons_checked) then 'checked' else 'not' end as addons_checked, --20 + r.flash_version, --21 + r.hangid, --22 + r.reason, --23 + r.process_type, --24 + r.app_notes, --25 + r.install_age, --26 + rd.duplicate_of, --27 + r.release_channel, --28 + r.productid --29 +from + reports r left join reports_duplicates rd on r.uuid = rd.uuid +where + '%(yesterday_str)s' <= r.date_processed and r.date_processed < '%(now_str)s' + %(prod_phrase)s %(ver_phrase)s +order by 5 -- r.date_processed, munged +""" + + +@as_backfill_cron_app +@with_postgres_transactions() +@with_single_postgres_transaction() +class DailyURLCronApp(BaseCronApp): + app_name = 'daily-url' + app_version = '1.0' + app_description = "" + + required_config = Namespace() + required_config.add_option( + 'output_path', + default='.', + doc="file system location to put the 'internal/private' " + "output csv file" + ) + required_config.add_option( + 'public_output_path', + default='.', + doc="file system location to put the 'external/public' " + "output csv file" + ) + required_config.add_option( + 'product', + default='Firefox', + doc="a comma delimited list of the products to track " + "(leave blank for all)" + ) + required_config.add_option( + 'version', + default='', + doc="a comma delimited list of the versions to track " + "(leave blank for all)" + ) + + # private scp + required_config.add_option( + 'private_user', + default='', + 
doc="User that will scp/ssh the private file" + ) + required_config.add_option( + 'private_server', + default='', + doc="Server to scp/ssh to" + ) + required_config.add_option( + 'private_location', + default='/tmp/', + doc="FS location to scp the file to" + ) + required_config.add_option( + 'private_ssh_command', + default='', + doc="Optional extra ssh command to send" + ) + # public scp + required_config.add_option( + 'public_user', + default='', + doc="User that will scp/ssh the public file" + ) + required_config.add_option( + 'public_server', + default='', + doc="Server to scp/ssh to" + ) + required_config.add_option( + 'public_location', + default='/tmp/%Y-%m-%d/', + doc="FS location to scp the file to" + ) + required_config.add_option( + 'public_ssh_command', + default='', + doc="Optional extra ssh command to send" + ) + + def run(self, connection, date): + logger = self.config.logger + cursor = connection.cursor() + # this is a rather unfortunate name hotpot. + # The argument "date" is a datetime.datetime instance + # .date() turns it into a datetime.date instance + day = (date - datetime.timedelta(days=1)).date() + with self.gzipped_csv_files(self.config, day) as files_tuple: + file_handles_tuple, file_names_tuple = files_tuple + headers_not_yet_written = True + id_cache = IdCache(cursor) + sql_parameters = self.setup_query_parameters(self.config, day) + logger.debug("day = %s; now = %s; yesterday = %s", + day, + sql_parameters.now_str, + sql_parameters.yesterday_str) + sql_query = SQL % sql_parameters + logger.debug("SQL is: %s", sql_query) + cursor.execute(sql_query) + for crash_row in cursor.fetchall(): + if headers_not_yet_written: + self.write_row( + file_handles_tuple, + [x[0] for x in cursor.description] + ) + headers_not_yet_written = False + column_value_list = self.process_crash(crash_row, id_cache) + self.write_row(file_handles_tuple, + column_value_list) + # end for loop over each crash_row + + private_out_pathname, public_out_pathname = file_names_tuple + self.scp_file(private_out_pathname, day) + if public_out_pathname: + self.scp_file(public_out_pathname, day, public=True) + + def scp_file(self, file_path, day, public=False): + + if public: + user = self.config.public_user + server = self.config.public_server + location = self.config.public_location + ssh_command = self.config.public_ssh_command + else: + user = self.config.private_user + server = self.config.private_server + location = self.config.private_location + ssh_command = self.config.private_ssh_command + + if '%' in location: + location = day.strftime(location) + + if not server: + return + + if user: + user += '@' + + command = 'scp "%s" "%s%s:%s"' % (file_path, user, server, location) + proc = subprocess.Popen( + command, + shell=True, + stdout=subprocess.PIPE, + stdin=subprocess.PIPE, + stderr=subprocess.PIPE + ) + stdout, stderr = proc.communicate() + if stderr: + self.config.logger.warn( + "Error when scp'ing the file %s: %s" % (file_path, stderr) + ) + + if ssh_command: + command = 'ssh "%s%s" "%s"' % (user, server, ssh_command) + proc = subprocess.Popen( + command, + shell=True, + stdout=subprocess.PIPE, + stdin=subprocess.PIPE, + stderr=subprocess.PIPE + ) + stdout, stderr = proc.communicate() + if stderr: + self.config.logger.warn( + "Error when sending ssh command (%s): %s" + % (ssh_command, stderr) + ) + + @staticmethod + def write_row(file_handles_tuple, crash_list): + """ + Write a row to each file: Seen by internal users (full details), and + external users (bowdlerized) + """ + 
private_file_handle, public_file_handle = file_handles_tuple + private_file_handle.writerow(crash_list) + crash_list[1] = 'URL (removed)' # remove url + crash_list[17] = '' # remove email + if public_file_handle: + public_file_handle.writerow(crash_list) + + @staticmethod + @contextlib.contextmanager + def gzipped_csv_files(config, day): + """Note: creating an empty csv.gz file with no content (i.e. no rows) + is not a bug. External systems will look at the filenames and will + assume the presence of files with every dates date in them.""" + logger = config.logger + private_out_filename = ("%s-crashdata.csv.gz" + % day.strftime('%Y%m%d')) + private_out_pathname = os.path.join(config.output_path, + private_out_filename) + private_gzip_file_handle = gzip.open(private_out_pathname, "w") + private_csv_file_handle = csv.writer(private_gzip_file_handle, + delimiter='\t', + lineterminator='\n') + + pubic_out_filename = ("%s-pub-crashdata.csv.gz" + % day.strftime('%Y%m%d')) + public_out_pathname = None + public_out_directory = config.get('public_output_path') + public_gzip_file_handle = None + public_csv_file_handle = None + if public_out_directory: + public_out_pathname = os.path.join(public_out_directory, + pubic_out_filename) + public_gzip_file_handle = gzip.open(public_out_pathname, "w") + public_csv_file_handle = csv.writer(public_gzip_file_handle, + delimiter='\t', + lineterminator='\n') + else: + logger.info("Will not create public (bowdlerized) gzip file") + # yield a tuple of two tuples + yield ( + (private_csv_file_handle, public_csv_file_handle), + (private_out_pathname, public_out_pathname) + ) + private_gzip_file_handle.close() + if public_gzip_file_handle: + public_gzip_file_handle.close() + + @staticmethod + def process_crash(a_crash_row, id_cache): + column_value_list = [] + os_name = None + ooid = '' + for i, x in enumerate(a_crash_row): + if x is None: + x = r'\N' + if i == 10: # r.os_name + x = os_name = x.strip() + if i == 11: # r.os_version + # per bug 519703 + x = id_cache.getAppropriateOsVersion(os_name, x) + os_name = None + if i == 14: # bug_associations.bug_id + x = ','.join(str(bugid) for bugid in x) + if i == 15: # r.user_comments + x = x.replace('\t', ' ') # per bug 519703 + if i == 17: # r.email + # -- show 'email' if the email is likely useful + # per bugs 529431/519703 + if '@' in x: + x = 'yes' + else: + x = '' + if isinstance(x, basestring): + x = x.strip().replace('\r', '').replace('\n', ' | ') + column_value_list.append(x) + return column_value_list + + @staticmethod + def setup_query_parameters(config, day): + now = day + datetime.timedelta(1) + now_str = now.strftime('%Y-%m-%d') + yesterday = day + yesterday_str = yesterday.strftime('%Y-%m-%d') + config.logger.debug( + "day = %s; now = %s; yesterday = %s", + day, + now, + yesterday + ) + prod_phrase = '' + if config.product: + if ',' in config.product: + prod_list = [x.strip() for x in config.product.split(',')] + prod_phrase = ( + "and r.product in ('%s')" % "','".join(prod_list) + ) + else: + prod_phrase = "and r.product = '%s'" % config.product + ver_phrase = '' + if config.version != '': + if ',' in config.product: + ver_list = [x.strip() for x in config.version.split(',')] + ver_phrase = "and r.version in ('%s')" % "','".join(ver_list) + else: + ver_phrase = "and r.version = '%s'" % config.version + + return DotDict({'now_str': now_str, + 'yesterday_str': yesterday_str, + 'prod_phrase': prod_phrase, + 'ver_phrase': ver_phrase}) diff --git a/socorro/database/migrations/migrateProcessType.py 
b/socorro/database/migrations/migrateProcessType.py new file mode 100644 index 0000000000..8ca80822fc --- /dev/null +++ b/socorro/database/migrations/migrateProcessType.py @@ -0,0 +1,104 @@ +#! /usr/bin/env python +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +import logging +import logging.handlers +import re +import sys +import time + +import socorro.lib.ConfigurationManager as cm + +from config.commonconfig import databaseHost +from config.commonconfig import databaseName +from config.commonconfig import databaseUserName +from config.commonconfig import databasePassword + +import config.commonconfig as config + +import psycopg2 + +import socorro.lib.util as lib_util +import socorro.database.postgresql as db_pgsql + + + +import socorro.lib.ConfigurationManager as configurationManager + +all_tables_sql = """ + SELECT table_name FROM information_schema.tables + WHERE table_schema='public' AND + table_type='BASE TABLE' AND + table_name LIKE 'reports_%' + ORDER BY table_name""" + +migrate_process_type_sql = """ + UPDATE %s SET process_type = 'plugin' + FROM %s + WHERE %s.process_type IS NULL AND %s.report_id = %s.id """ + +def migrate_process_type_params(reports, plugins_reports): + """ Makes a tuple suitable for prepared statment """ + return (reports, plugins_reports, reports, plugins_reports, reports) + +def main(): + try: + logger = setupLog() + configContext = setupConfig() + logger.info("current configuration\n%s", str(configContext)) + conn = None + try: + testConfig(configContext) + databaseDSN = "host=%(databaseHost)s dbname=%(databaseName)s user=%(databaseUserName)s password=%(databasePassword)s" % configContext + # Be sure self.connection is closed before you quit! 
+ conn = psycopg2.connect(databaseDSN) + cursor = conn.cursor() + cursor.execute(all_tables_sql) + tables = cursor.fetchall() + for reports in tables: + logger.info("Processing %s" % reports[0]) + plugins_reports = "plugins_%s" % reports[0] + params = migrate_process_type_params(reports[0], plugins_reports) + try: + cursor.execute(migrate_process_type_sql % params) + logger.info("%d rows updated" % cursor.rowcount) + conn.commit() + except psycopg2.ProgrammingError, x: + logging.warn("Skipping %s as %s doesn't exist" % (reports[0], plugins_reports)) + conn.rollback() + conn.close() + except (psycopg2.OperationalError, AssertionError),x: + lib_util.reportExceptionAndAbort(logger) + finally: + logger.info("done.") + +def setupLog(): + logger = logging.getLogger("migrateProcessType") + logger.setLevel(10) + + stderrLog = logging.StreamHandler() + stderrLog.setLevel(10) + stderrLogFormatter = logging.Formatter('%(asctime)s %(levelname)s - %(message)s') + stderrLog.setFormatter(stderrLogFormatter) + logger.addHandler(stderrLog) + return logger + +def setupConfig(): + try: + return configurationManager.newConfiguration(configurationModule=config, applicationName="Migrate Process Type") + except configurationManager.NotAnOptionError, x: + print >>sys.stderr, x + print >>sys.stderr, "for usage, try --help" + sys.exit(1) + +def testConfig(configContext): + assert "databaseHost" in configContext, "databaseHost is missing from the configuration" + assert "databaseName" in configContext, "databaseName is missing from the configuration" + assert "databaseUserName" in configContext, "databaseUserName is missing from the configuration" + assert "databasePassword" in configContext, "databasePassword is missing from the configuration" + +if __name__ == "__main__": + main() diff --git a/socorro/database/postgresql.py b/socorro/database/postgresql.py new file mode 100644 index 0000000000..036a085e94 --- /dev/null +++ b/socorro/database/postgresql.py @@ -0,0 +1,145 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
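A note on migrateProcessType.py above: the %s placeholders in migrate_process_type_sql stand for table names, so they are filled by ordinary Python string formatting before cursor.execute() runs, not by psycopg2 parameter binding (identifiers cannot be bound as query parameters). A small illustration, assuming migrate_process_type_sql and migrate_process_type_params from the script above are in scope; the partition names are hypothetical examples, since the real ones are discovered at run time via all_tables_sql::

    # hypothetical weekly partition pair
    reports_table = "reports_20100104"
    plugins_reports_table = "plugins_%s" % reports_table   # "plugins_reports_20100104"

    params = migrate_process_type_params(reports_table, plugins_reports_table)
    print migrate_process_type_sql % params
    # UPDATE reports_20100104 SET process_type = 'plugin'
    #     FROM plugins_reports_20100104
    #     WHERE reports_20100104.process_type IS NULL
    #       AND plugins_reports_20100104.report_id = reports_20100104.id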
+ +# XXX Set to be deprecated in favor of socorro/external/postgresql/models.py +# XXX And possibly long deprecated in favor of socorro/external/postgresql + +#----------------------------------------------------------------------------------------------------------------- +def tablesMatchingPattern(tableNamePattern, databaseCursor): + """ return a list of the names of all indexes for the given table""" + databaseCursor.execute(""" + select + ct.relname + from + pg_class ct + where + ct.relname like '%s' + and ct.reltype <> 0""" % tableNamePattern) + return [x[0] for x in databaseCursor.fetchall()] + +#----------------------------------------------------------------------------------------------------------------- +def triggersForTable(tableName, databaseCursor): + """ return a list of the names of all indexes for the given table""" + databaseCursor.execute(""" + select + pg_trigger.tgname + from + pg_trigger join pg_class on pg_trigger.tgrelid = pg_class.oid and pg_class.relname = '%s'""" % tableName) + return [x[0] for x in databaseCursor.fetchall()] + +#----------------------------------------------------------------------------------------------------------------- +def indexesForTable(tableName, databaseCursor): + """ return a list of the names of all indexes for the given table""" + databaseCursor.execute(""" + select + it.relname + from + pg_class ct join pg_index i on ct.oid = i.indrelid and ct.relname = '%s' + join pg_class it on it.oid = i.indexrelid""" % tableName) + return [x[0] for x in databaseCursor.fetchall()] + +#----------------------------------------------------------------------------------------------------------------- +def rulesForTable(tableName, databaseCursor): + """ return a list of the names of all rules for the given table""" + databaseCursor.execute(""" + select + rulename + from + pg_rules + where + tablename = '%s'""" % tableName) + return [x[0] for x in databaseCursor.fetchall()] + +#----------------------------------------------------------------------------------------------------------------- +def constraintsAndTypeForTable(tableName, databaseCursor): + """return a list of (constraintName, constraintType) tuples for the given table""" + databaseCursor.execute(""" + select + conname, + contype + from + pg_constraint cn join pg_class cls on cn.conrelid = cls.oid and cls.relname = '%s'""" % tableName) + return [x for x in databaseCursor.fetchall()] + +#----------------------------------------------------------------------------------------------------------------- +def columnNameTypeDictionaryForTable (tableName, databaseCursor): + """ return a dictionary of column types keys by column name""" + databaseCursor.execute(""" + select + pg_attribute.attname as columnname, + pg_type.typname as columntype + from + pg_type join pg_attribute on pg_type.oid = pg_attribute.atttypid + join pg_class on (pg_attribute.attrelid = pg_class.oid and pg_class.relname = '%s') + where + pg_type.typname not in ('oid', 'cid', 'tid', 'xid') + order by + pg_attribute.attname""" % tableName) + namesToTypesDict = {} + for aRow in databaseCursor.fetchall(): + namesToTypesDict[aRow[0]] = aRow[1] + return namesToTypesDict + +#----------------------------------------------------------------------------------------------------------------- +def childTablesForTable(tableName, databaseCursor): + """ return a list of tables that are children (via inherits) for the given table""" + databaseCursor.execute(""" + select + cls1.relname + from + pg_class cls1 join pg_inherits inh on cls1.oid = 
inh.inhrelid + join pg_class cls2 on inh.inhparent = cls2.oid and cls2.relname = '%s'""" % tableName) + return [x[0] for x in databaseCursor.fetchall()] + +def connectionStatus(aConnection): + """Debugging aid. Particularly note transaction status of 'INTRANS' and 'INERROR'""" + statusStrings = { + 0:'SETUP', 1:'READY', 2:'BEGIN', 3:'SYNC',4:'ASYNC', + } + transStatusStrings = { + 0:'IDLE', 1:'ACTIVE', 2:'INTRANS', 3:'INERROR', 4:'UNKNOWN', + } + return "Status: %s, Transaction Status: %s"%(statusStrings.get(aConnection.status,'UNK'),transStatusStrings.get(aConnection.get_transaction_status(),"UNK")) + +def getSequenceNameForColumn(tableName, columnName, cursor): + """ + Return the name of the sequence which provides defaults for columns of type serial + returns None if the values don't identify a column that owns a sequence + Does NOT commit() the connection. + Thanks to postgres experts Jonathan Daugherty and Alvaro Herrera + http://archives.postgresql.org/pgsql-general/2004-10/msg01375.php # Re: determine sequence name for a serial + """ + sql = """SELECT seq.relname::text + FROM pg_class src, pg_class seq, pg_namespace, pg_attribute, pg_depend + WHERE + pg_depend.refobjsubid = pg_attribute.attnum AND + pg_depend.refobjid = src.oid AND + seq.oid = pg_depend.objid AND + src.relnamespace = pg_namespace.oid AND + pg_attribute.attrelid = src.oid AND + pg_namespace.nspname = 'public' AND + src.relname = %s AND + pg_attribute.attname = %s""" + cursor.execute(sql,(tableName,columnName)) + data = cursor.fetchone() + if data: + data = data[0] + return data + +def getCurrentValue(tableName, columnName, cursor): + """ + Find out which (id) was most recently set for a table and column name. Else None if unavailable. + Does NOT commit() the connection + ---- + NOTE: 'SELECT lastval()' is often better: http://www.postgresql.org/docs/8.3/interactive/functions-sequence.html + """ + ret = None + seq = getSequenceNameForColumn(tableName,columnName,cursor) + if seq: + try: + cursor.execute("SELECT currval(%s)",(seq,)) + ret = cursor.fetchone()[0] + except: + ret = None + return ret diff --git a/socorro/database/schema.py b/socorro/database/schema.py new file mode 100644 index 0000000000..57fb75354e --- /dev/null +++ b/socorro/database/schema.py @@ -0,0 +1,1058 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +# XXX Set to be deprecated in favor of socorro/external/postgresql/models.py + +import psycopg2 as pg +import datetime as dt +import threading + +import socorro.lib.prioritize as socorro_pri +import socorro.lib.psycopghelper as socorro_psy +import socorro.database.postgresql as socorro_pg + +import socorro.lib.util as socorro_util +""" +Schema.py contains several utility functions and the code which describes most of the database tables used by socorro. +However, large portions of Schema.py are out of date and the file is slated for replacement by different code. 
+""" + +#----------------------------------------------------------------------------------------------------------------- +def mondayPairsIteratorFactory(minDate, maxDate): + """ + Given a pair of dates, creates iterator that returns (aMonday,theNextMonday) such that + - the first returned pair defines an interval holding minDate + - the last returned pair defines an interval holding maxDate + if minDate or maxDate are not instances of datetime.date, raises TypeError + if maxDate > minDate, raises ValueError + """ + if not (isinstance(minDate,dt.date) and isinstance(maxDate,dt.date)): + raise TypeError("minDate and maxDate must be instances of datetime.date") + if maxDate < minDate: + raise ValueError("minDate must be <= maxDate") + def anIterator(): + oneWeek = dt.timedelta(7) + aDate = minDate - dt.timedelta(minDate.weekday()) # begin on Monday before minDate + while aDate <= maxDate: + nextMonday = aDate + oneWeek + yield (aDate, nextMonday) + aDate = nextMonday + return anIterator() + +#----------------------------------------------------------------------------------------------------------------- +# For each database TableClass below, +# databaseDependenciesForSetup[TableClass] = [List of TableClasses on which this TableClass depends] +# NOTE: This requires that new Tables be added textually below every Table on which they depend +databaseDependenciesForSetup = {} +def getOrderedSetupList(whichTables = None): + """ + A helper function to get the correct order to create tables during setup. + whichTables is a list of Tables, possibly empty, or None + If not whichTables, then all the known tables are visited + """ + # if whichTables is None, then databaseDependenciesForSetup.keys() is used + return socorro_pri.dependencyOrder(databaseDependenciesForSetup,whichTables) +databaseDependenciesForPartition = {} +def getOrderedPartitionList(whichTables): + """ + A helper function to get the needed PartionedTables for a given set of PartitionedTables + """ + if not whichTables: + return [] + order = socorro_pri.dependencyOrder(databaseDependenciesForPartition,whichTables) + return order + +# This set caches knowledge of existing partition tables to avoid hitting database. 
Beware cache incoherence +partitionCreationHistory = set() +#----------------------------------------------------------------------------------------------------------------- +def partitionWasCreated(partitionTableName): + """Helper function to examine partitionCreationHistory""" + return partitionTableName in partitionCreationHistory +#----------------------------------------------------------------------------------------------------------------- +def markPartitionCreated(partitionTableName): + """Helper function to update partitionCreationHistory""" + global partitionCreationHistory + partitionCreationHistory.add(partitionTableName) + +#================================================================================================================= +class PartitionControlParameterRequired(Exception): + def __init__(self): + super(PartitionControlParameterRequired, self).__init__("No partition control paramter was supplied") + +#================================================================================================================= +class DatabaseObject(object): + """ + Base class for all objects (Tables, Constraints, Indexes) that may be individually created and used in the database + Classes that inherit DatabaseObject: + - Must supply appropriate creationSql parameter to the superclass constructor + - May override method additionalCreationProcedure(self,aDatabaseCursor). If this is provided, it is + called after creationSql is executed in method create(self,aDatabaseCursor) + The cursor's connection is neither committed nor rolled back during the call to create + - May override methods which do nothing in this class: + = drop(self,aDatabaseCursor) + = updateDefinition(self,aDatabaseCursor) + = createPartitions(self,aDatabaseCursor,aPartitionDetailsIterator) + Every leaf class that inherits DatabaseObject should be aware of the module-level dictionary: databaseDependenciesForSetup. + If that leaf class should be created when the database is being set up, the class itself must be added as a key in the + databaseDependenciesForSetup dictionary. The value associated with that key is a possibly empty iterable containing the + classes on which the particular leaf class depends: Those that must already be created before the particular instance is + created. This is often because the particular table has one or more foreign keys referencing tables upon which it depends. 
+ """ + #----------------------------------------------------------------------------------------------------------------- + def __init__(self, name=None, logger=None, creationSql=None, **kwargs): + super(DatabaseObject, self).__init__() + self.name = name + self.creationSql = creationSql + self.logger = logger + #----------------------------------------------------------------------------------------------------------------- + def _createSelf(self,databaseCursor): + databaseCursor.execute(self.creationSql) + self.additionalCreationProcedures(databaseCursor) + #----------------------------------------------------------------------------------------------------------------- + def create(self, databaseCursor): + orderedDbObjectList = getOrderedSetupList([self.__class__]) + for dbObjectClass in orderedDbObjectList: + dbObjectObject = self + if not self.__class__ == dbObjectClass: + dbObjectObject = dbObjectClass(logger = self.logger) + databaseCursor.execute("savepoint creating_%s"%dbObjectObject.name) + try: + dbObjectObject._createSelf(databaseCursor) + databaseCursor.execute("release savepoint creating_%s"%dbObjectObject.name) + except pg.ProgrammingError,x: + databaseCursor.execute("rollback to creating_%s"%dbObjectObject.name) + databaseCursor.connection.commit() + self.logger.debug("%s - in create for %s, dbObject %s exists",threading.currentThread().getName(),self.name,dbObjectObject.name) + + #----------------------------------------------------------------------------------------------------------------- + def additionalCreationProcedures(self, databaseCursor): + pass + #----------------------------------------------------------------------------------------------------------------- + def updateDefinition(self, databaseCursor): + pass + #----------------------------------------------------------------------------------------------------------------- + def drop(self, databaseCursor): + pass + #----------------------------------------------------------------------------------------------------------------- + def createPartitions(self, databaseCursor, iterator): + pass + +#================================================================================================================= +class Table (DatabaseObject): + """ + Base class for all Table objects that may be created and used in the database. 
+ Classes that inherit DatabaseObject: + - Must supply appropriate creationSql parameter to the superclass constructor + - May override method insert(self,rowTuple, **kwargs) to do the right thing during an insert + - May provide method alterColumnDefinitions(self,aDatabaseCursor,tableName) + - May provide method updateDefinition(self,aDatabaseCursor) + - Must be aware of databaseDependenciesForSetup and how it is used + class Table inherits method create from DatabaseObject + class Table provides a reasonable implementation of method drop, overriding the empty one in DatabaseObject + """ + #----------------------------------------------------------------------------------------------------------------- + def __init__(self, name=None, logger=None, creationSql=None, **kwargs): + super(Table, self).__init__(name=name, logger=logger, creationSql=creationSql, **kwargs) + #----------------------------------------------------------------------------------------------------------------- + def drop(self, databaseCursor): + databaseCursor.execute("drop table if exists %s cascade" % self.name) + #----------------------------------------------------------------------------------------------------------------- + def insert(self, rowTuple=None, **kwargs): + pass + +#================================================================================================================= +class PartitionedTable(Table): + """ + Base class for Tables that will be partitioned or are likely to be programmatically altered. + Classes that inherit PartitionedTable + - Must supply self.insertSql with 'TABLENAME' replacing the actual table name + - Must supply appropriate creationSql and partitionCreationSqlTemplate to the superclass constructor + - Should NOT override method insert, which does something special for PartitionedTables + - May override method partitionCreationParameters(self, partitionDetails) which returns a dictionary suitable for string formatting + + Every leaf class that inherits PartitionedTable should be aware of the module-level dictionary: databaseDependenciesForPartition + If that leaf class has a partition that depends upon some other partition, then it must be added as a key to the dictionary + databaseDependenciesForPartition. The value associated with that key is an iterable containing the classes that define the partitions + on which this particular leaf class depends: Those that must already be created before the particular instance is created. This is + most often because the particular partition table has one or more foreign keys referencing partition tables upon which it depends. 
+ """ + #----------------------------------------------------------------------------------------------------------------- + partitionCreationLock = threading.RLock() + def __init__ (self, name=None, logger=None, creationSql=None, partitionNameTemplate='%s', partitionCreationSqlTemplate='', weekInterval=None, **kwargs): + super(PartitionedTable, self).__init__(name=name, logger=logger, creationSql=creationSql) + self.partitionNameTemplate = partitionNameTemplate + self.partitionCreationSqlTemplate = partitionCreationSqlTemplate + self.weekInterval = weekInterval + if not weekInterval: + today = dt.date.today() + self.weekInterval = mondayPairsIteratorFactory(today,today) + self.insertSql = None + + #----------------------------------------------------------------------------------------------------------------- + #def additionalCreationProcedures(self, databaseCursor): + #self.createPartitions(databaseCursor, self.weekInterval) + #----------------------------------------------------------------------------------------------------------------- + def _createOwnPartition(self, databaseCursor, uniqueItems): + """ + Internal method that assumes all precursor partitions are already in place before creating this one. Called + from createPartitions(same parameters) to avoid bottomless recursion. Creates one or more partitions for + this particular table, (more if uniqueItems has more than one element) + side effect: Cursor's connection has been committed() by the time we return + """ + self.logger.debug("%s - in createOwnPartition for %s",threading.currentThread().getName(),self.name) + for x in uniqueItems: + #self.logger.debug("DEBUG - item value is %s",x) + partitionCreationParameters = self.partitionCreationParameters(x) + partitionName = self.partitionNameTemplate % partitionCreationParameters["partitionName"] + if partitionWasCreated(partitionName): + #self.logger.debug("DEBUG - skipping creation of %s",partitionName) + continue + partitionCreationSql = self.partitionCreationSqlTemplate % partitionCreationParameters + #self.logger.debug("%s - Sql for %s is %s",threading.currentThread().getName(),self.name,partitionCreationSql) + aPartition = Table(name=partitionName, logger=self.logger, creationSql=partitionCreationSql) + self.logger.debug("%s - savepoint createPartitions_%s",threading.currentThread().getName(), partitionName) + databaseCursor.execute("savepoint createPartitions_%s" % partitionName) + try: + self.logger.debug("%s - creating %s", threading.currentThread().getName(), partitionName) + aPartition._createSelf(databaseCursor) + markPartitionCreated(partitionName) + self.logger.debug("%s - successful - releasing savepoint", threading.currentThread().getName()) + databaseCursor.execute("release savepoint createPartitions_%s" % partitionName) + except pg.ProgrammingError, x: + self.logger.debug("%s -- Rolling back and releasing savepoint: Creating %s failed in createPartitions: %s", threading.currentThread().getName(), partitionName, str(x).strip()) + databaseCursor.execute("rollback to createPartitions_%s; release savepoint createPartitions_%s;" % (partitionName, partitionName)) + databaseCursor.connection.commit() + + #----------------------------------------------------------------------------------------------------------------- + def createPartitions(self, databaseCursor, iterator): + """ + Create this table's partition(s) and all the precursor partition(s) needed to support this one + databaseCursor: as always + iterator: Supplies at least one unique identifier (a date). 
If more than one then more than one (family of) + partition(s) is created + side effects: The cursor's connection will be rolled back or committed by the end of this method + """ + self.logger.debug("%s - in createPartitions", threading.currentThread().getName()) + partitionTableClasses = getOrderedPartitionList([self.__class__]) + #self.logger.debug("DEBUG - Classes are %s",partitionTableClasses) + uniqueItems = [x for x in iterator] + for tableClass in partitionTableClasses: + tableObject = self + if not self.__class__ == tableClass: + tableObject = tableClass(logger = self.logger) + #self.logger.debug("DEBUG - Handling %s /w/ sql %s",tableObject.name,tableObject.partitionCreationSqlTemplate) + tableObject._createOwnPartition(databaseCursor,uniqueItems) + + #----------------------------------------------------------------------------------------------------------------- + def partitionCreationParameters(self,partitioningData): + """returns: a dictionary of string substitution parameters""" + return {} + #----------------------------------------------------------------------------------------------------------------- + def updateColumnDefinitions(self, databaseCursor): + childTableList = socorro_pg.childTablesForTable(self.name, databaseCursor) + for aChildTableName in childTableList: + databaseCursor.execute("alter table %s no inherit %s", (aTable, aChildTableName)) + self.alterColumnDefinitions(databaseCursor, self.name) + for aChildTableName in childTableList: + self.alterColumnDefinitions(databaseCursor, aChildTableName) + for aChildTableName in childTableList: + databaseCursor.execute("alter table %s inherit %s", (aTable, aChildTableName)) + #----------------------------------------------------------------------------------------------------------------- + def insert(self, databaseCursor, row, alternateCursorFunction, **kwargs): + try: + uniqueIdentifier = kwargs["date_processed"] + except KeyError: + raise PartitionControlParameterRequired() + dateRangeTuple = mondayPairsIteratorFactory(uniqueIdentifier, uniqueIdentifier).next()# create iterator and throw away + partitionName = self.partitionCreationParameters(dateRangeTuple)["partitionName"] + insertSql = self.insertSql.replace('TABLENAME', partitionName) + try: + databaseCursor.execute("savepoint %s" % partitionName) + #self.logger.debug("%s - Trying to insert into %s", threading.currentThread().getName(), self.name) + databaseCursor.execute(insertSql, row) + databaseCursor.execute("release savepoint %s" % partitionName) + except pg.ProgrammingError, x: + self.logger.debug('%s - Rolling back and releasing savepoint: failed: %s', threading.currentThread().getName(), str(x).strip()) + databaseCursor.execute("rollback to %s; release savepoint %s;" % (partitionName, partitionName)) + databaseCursor.connection.commit() # This line added after of hours of blood, sweat, tears. Remove only per deathwish. 
+ + altConnection, altCursor = alternateCursorFunction() + dateIterator = mondayPairsIteratorFactory(uniqueIdentifier, uniqueIdentifier) + try: + self.createPartitions(altCursor,dateIterator) + except pg.DatabaseError,x: + self.logger.debug("%s - Failed to create partition(s) %s: %s:%s", threading.currentThread().getName(), partitionName, type(x), x) + self.logger.debug("%s - trying to insert into %s for the second time", threading.currentThread().getName(), self.name) + databaseCursor.execute(insertSql, row) + +#================================================================================================================= +class ReportsTable(PartitionedTable): + """Define the table 'reports'""" + #----------------------------------------------------------------------------------------------------------------- + def __init__ (self, logger, **kwargs): + super(ReportsTable, self).__init__(name='reports', logger=logger, + creationSql=""" + CREATE TABLE reports ( + id serial NOT NULL, + client_crash_date timestamp with time zone, + date_processed timestamp with time zone, + uuid character varying(50) NOT NULL, + product character varying(30), + version character varying(16), + build character varying(30), + signature character varying(255), + url character varying(255), + install_age integer, + last_crash integer, + uptime integer, + cpu_name character varying(100), + cpu_info character varying(100), + reason character varying(255), + address character varying(20), + os_name character varying(100), + os_version character varying(100), + email character varying(100), + user_id character varying(50), + started_datetime timestamp with time zone, + completed_datetime timestamp with time zone, + success boolean, + truncated boolean, + processor_notes text, + user_comments character varying(1024), + app_notes character varying(1024), + distributor character varying(20), + distributor_version character varying(20), + topmost_filenames TEXT, + addons_checked boolean, + flash_version TEXT, + hangid TEXT, + process_type TEXT, + release_channel TEXT + ); + --CREATE TRIGGER reports_insert_trigger + -- BEFORE INSERT ON reports + -- FOR EACH ROW EXECUTE PROCEDURE partition_insert_trigger();""", + partitionCreationSqlTemplate=""" + CREATE TABLE %(partitionName)s ( + CONSTRAINT %(partitionName)s_date_check CHECK (TIMESTAMP with time zone '%(startDate)s UTC' <= date_processed and date_processed < TIMESTAMP with time zone '%(endDate)s UTC'), + CONSTRAINT %(partitionName)s_unique_uuid unique (uuid), + PRIMARY KEY(id) + ) + INHERITS (reports); + CREATE INDEX %(partitionName)s_date_processed_key ON %(partitionName)s (date_processed); + CREATE INDEX %(partitionName)s_uuid_key ON %(partitionName)s (uuid); + CREATE INDEX %(partitionName)s_url_key ON %(partitionName)s (url); + CREATE INDEX %(partitionName)s_build_key ON %(partitionName)s (build); + CREATE INDEX %(partitionName)s_product_version_key ON %(partitionName)s (product, version); + CREATE INDEX %(partitionName)s_signature_date_processed_build_key ON %(partitionName)s (signature, date_processed, build); + CREATE INDEX %(partitionName)s_hangid_idx ON %(partitionName)s (hangid); + CREATE INDEX %(partitionName)s_reason ON %(partitionName)s (reason); + """ + ) + self.columns = ("uuid", "client_crash_date", "date_processed", "product", "version", "build", "url", "install_age", "last_crash", "uptime", "email", "user_id", "user_comments", "app_notes", "distributor", "distributor_version", "topmost_filenames", "addons_checked", "flash_version", "hangid", 
"process_type", "release_channel") + self.insertSql = """insert into TABLENAME + (uuid, client_crash_date, date_processed, product, version, build, url, install_age, last_crash, uptime, email, user_id, user_comments, app_notes, distributor, distributor_version, topmost_filenames, addons_checked, flash_version, hangid, process_type, release_channel) values + (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""" + #----------------------------------------------------------------------------------------------------------------- + def additionalCreationProcedures(self, databaseCursor): + pass + #----------------------------------------------------------------------------------------------------------------- + def partitionCreationParameters(self, uniqueIdentifier): + startDate, endDate = uniqueIdentifier + startDateAsString = "%4d-%02d-%02d" % startDate.timetuple()[:3] + compressedStartDateAsString = startDateAsString.replace("-", "") + endDateAsString = "%4d-%02d-%02d" % endDate.timetuple()[:3] + return { "partitionName": "reports_%s" % compressedStartDateAsString, + "startDate": startDateAsString, + "endDate": endDateAsString, + "compressedStartDate": compressedStartDateAsString + } + #----------------------------------------------------------------------------------------------------------------- + def alterColumnDefinitions(self, databaseCursor, tableName): + columnNameTypeDictionary = socorro_pg.columnNameTypeDictionaryForTable(tableName, databaseCursor) + + #----------------------------------------------------------------------------------------------------------------- + def updateDefinition(self, databaseCursor): + databaseCursor.execute("""DROP RULE IF EXISTS rule_reports_partition ON reports;""") + self.updateColumnDefinitions(databaseCursor) + indexesList = socorro_pg.indexesForTable(self.name, databaseCursor) + +databaseDependenciesForSetup[ReportsTable] = [] + +#================================================================================================================= +class PriorityJobsTable(Table): + """Define the table 'priorityjobs'""" + #----------------------------------------------------------------------------------------------------------------- + def __init__ (self, name="priorityjobs", logger=None, **kwargs): + super(PriorityJobsTable, self).__init__(name=name, logger=logger, + creationSql = """ + CREATE TABLE %s ( + uuid varchar(255) NOT NULL PRIMARY KEY + );""" % name) +databaseDependenciesForSetup[PriorityJobsTable] = [] + +#================================================================================================================= +class BugsTable(Table): + """Define the table 'bug_associations'""" + #----------------------------------------------------------------------------------------------------------------- + def __init__ (self, logger, **kwargs): + super(BugsTable, self).__init__(name = "bugs", logger=logger, + creationSql = """ + CREATE TABLE bugs ( + id int NOT NULL, + status text, + resolution text, + short_desc text + ); + ALTER TABLE ONLY bugs + ADD CONSTRAINT bugs_pkey PRIMARY KEY (id); + """) + #----------------------------------------------------------------------------------------------------------------- + def updateDefinition(self, databaseCursor): + if socorro_pg.tablesMatchingPattern(self.name) == []: + #this table doesn't exist yet, create it + self.create(databaseCursor) + +databaseDependenciesForSetup[BugsTable] = [] + 
+#================================================================================================================= +class BugAssociationsTable(Table): + """Define the table 'bug_associations'""" + #----------------------------------------------------------------------------------------------------------------- + def __init__ (self, logger, **kwargs): + super(BugAssociationsTable, self).__init__(name = "bug_associations", logger=logger, + creationSql = """ + CREATE TABLE bug_associations ( + signature text NOT NULL, + bug_id int NOT NULL + ); + ALTER TABLE ONLY bug_associations + ADD CONSTRAINT bug_associations_pkey PRIMARY KEY (signature, bug_id); + CREATE INDEX idx_bug_associations_bug_id ON bug_associations (bug_id); + ALTER TABLE bug_associations + ADD CONSTRAINT bug_associations_bug_id_fkey FOREIGN KEY (bug_id) REFERENCES bugs(id) ON DELETE CASCADE; + """) + #----------------------------------------------------------------------------------------------------------------- + def updateDefinition(self, databaseCursor): + if socorro_pg.tablesMatchingPattern(self.name) == []: + #this table doesn't exist yet, create it + self.create(databaseCursor) + +databaseDependenciesForSetup[BugAssociationsTable] = [BugsTable] + + +#================================================================================================================= +class ServerStatusTable(Table): + """Define the table 'server_status'""" + #----------------------------------------------------------------------------------------------------------------- + def __init__ (self, logger, **kwargs): + super(ServerStatusTable, self).__init__(name='server_status', logger=logger, + creationSql=""" + CREATE TABLE server_status ( + id serial NOT NULL, + date_recently_completed timestamp with time zone, + date_oldest_job_queued timestamp with time zone, + avg_process_sec real, + avg_wait_sec real, + waiting_job_count integer NOT NULL, + processors_count integer NOT NULL, + date_created timestamp with time zone NOT NULL + ); + ALTER TABLE ONLY server_status + ADD CONSTRAINT server_status_pkey PRIMARY KEY (id); + CREATE INDEX idx_server_status_date ON server_status USING btree (date_created, id); + """) +databaseDependenciesForSetup[ServerStatusTable] = [] + +#================================================================================================================= +class ReleaseEnum(DatabaseObject): + def __init__(self,logger, **kwargs): + super(ReleaseEnum, self).__init__(name='release_enum', logger=logger, + creationSql="CREATE TYPE release_enum AS ENUM ('major', 'milestone', 'development');" + ) + def drop(self, databaseCursor): + databaseCursor.execute("drop type if exists %s cascade"%self.name) +databaseDependenciesForSetup[ReleaseEnum] = [] + +#================================================================================================================= +class ExtensionsTable(PartitionedTable): + """Define the table 'extensions'""" + #----------------------------------------------------------------------------------------------------------------- + def __init__ (self, logger, **kwargs): + super(ExtensionsTable, self).__init__(name='extensions', logger=logger, + creationSql=""" + CREATE TABLE extensions ( + report_id integer NOT NULL, + date_processed timestamp with time zone, + extension_key integer NOT NULL, + extension_id text NOT NULL, + extension_version text + ); + --CREATE TRIGGER extensions_insert_trigger + -- BEFORE INSERT ON extensions + -- FOR EACH ROW EXECUTE PROCEDURE partition_insert_trigger();""", + 
partitionCreationSqlTemplate=""" + CREATE TABLE %(partitionName)s ( + CONSTRAINT %(partitionName)s_date_check CHECK (TIMESTAMP with time zone '%(startDate)s UTC' <= date_processed and date_processed < TIMESTAMP with time zone '%(endDate)s UTC'), + PRIMARY KEY (report_id, extension_key) + ) + INHERITS (extensions); + CREATE INDEX %(partitionName)s_report_id_date_key ON %(partitionName)s (report_id, date_processed, extension_key); + CREATE INDEX %(partitionName)s_extension_id_extension_version_idx ON %(partitionName)s (extension_id, extension_version); + ALTER TABLE %(partitionName)s + ADD CONSTRAINT %(partitionName)s_report_id_fkey FOREIGN KEY (report_id) REFERENCES reports_%(compressedStartDate)s(id) ON DELETE CASCADE; + """) + self.insertSql = """insert into TABLENAME (report_id, date_processed, extension_key, extension_id, extension_version) values (%s, %s, %s, %s, %s)""" + #----------------------------------------------------------------------------------------------------------------- + def alterColumnDefinitions(self, databaseCursor, tableName): + columnNameTypeDictionary = socorro_pg.columnNameTypeDictionaryForTable(tableName, databaseCursor) + #if 'date_processed' not in columnNameTypeDictionary: + #databaseCursor.execute("""ALTER TABLE %s + #ADD COLUMN date_processed TIMESTAMP without time zone;""" % tableName) + #----------------------------------------------------------------------------------------------------------------- + def updateDefinition(self, databaseCursor): + self.updateColumnDefinitions(databaseCursor) + indexesList = socorro_pg.indexesForTable(self.name, databaseCursor) + #if 'extensions_pkey' in indexesList: + #databaseCursor.execute("""ALTER TABLE extensions + #DROP CONSTRAINT extensions_pkey;""") + #databaseCursor.execute("""DROP RULE IF EXISTS rule_extensions_partition ON extensions;""") + #triggersList = socorro_pg.triggersForTable(self.name, databaseCursor) + #if 'extensions_insert_trigger' not in triggersList: + #databaseCursor.execute("""CREATE TRIGGER extensions_insert_trigger + #BEFORE INSERT ON extensions + #FOR EACH ROW EXECUTE PROCEDURE partition_insert_trigger();""") + #----------------------------------------------------------------------------------------------------------------- + def partitionCreationParameters(self, uniqueIdentifier): + startDate, endDate = uniqueIdentifier + startDateAsString = "%4d-%02d-%02d" % startDate.timetuple()[:3] + compressedStartDateAsString = startDateAsString.replace("-", "") + endDateAsString = "%4d-%02d-%02d" % endDate.timetuple()[:3] + return { "partitionName": "extensions_%s" % compressedStartDateAsString, + "startDate": startDateAsString, + "endDate": endDateAsString, + "compressedStartDate": compressedStartDateAsString + } +databaseDependenciesForPartition[ExtensionsTable] = [ReportsTable] +databaseDependenciesForSetup[ExtensionsTable] = [] + +#================================================================================================================= +class FramesTable(PartitionedTable): + """Define the table 'frames'""" + #----------------------------------------------------------------------------------------------------------------- + def __init__ (self, logger, **kwargs): + super(FramesTable, self).__init__(name='frames', logger=logger, + creationSql=""" + CREATE TABLE frames ( + report_id integer NOT NULL, + date_processed timestamp with time zone, + frame_num integer NOT NULL, + signature varchar(255) + );""", + partitionCreationSqlTemplate=""" + CREATE TABLE %(partitionName)s ( + CONSTRAINT 
%(partitionName)s_date_check CHECK (TIMESTAMP with time zone '%(startDate)s UTC' <= date_processed and date_processed < TIMESTAMP with time zone '%(endDate)s UTC'), + PRIMARY KEY (report_id, frame_num) + ) + INHERITS (frames); + CREATE INDEX %(partitionName)s_report_id_date_key ON %(partitionName)s (report_id, date_processed); + ALTER TABLE %(partitionName)s + ADD CONSTRAINT %(partitionName)s_report_id_fkey FOREIGN KEY (report_id) REFERENCES reports_%(compressedStartDate)s(id) ON DELETE CASCADE; + """ + ) + self.insertSql = """insert into TABLENAME (report_id, frame_num, date_processed, signature) values (%s, %s, %s, %s)""" + #----------------------------------------------------------------------------------------------------------------- + def alterColumnDefinitions(self, databaseCursor, tableName): + columnNameTypeDictionary = socorro_pg.columnNameTypeDictionaryForTable(tableName, databaseCursor) + + #----------------------------------------------------------------------------------------------------------------- + def updateDefinition(self, databaseCursor): + self.updateColumnDefinitions(databaseCursor) + indexesList = socorro_pg.indexesForTable(self.name, databaseCursor) + + #----------------------------------------------------------------------------------------------------------------- + def partitionCreationParameters(self, uniqueIdentifier): + startDate, endDate = uniqueIdentifier + startDateAsString = "%4d-%02d-%02d" % startDate.timetuple()[:3] + compressedStartDateAsString = startDateAsString.replace("-", "") + endDateAsString = "%4d-%02d-%02d" % endDate.timetuple()[:3] + return { "partitionName": "frames_%s" % compressedStartDateAsString, + "startDate": startDateAsString, + "endDate": endDateAsString, + "compressedStartDate": compressedStartDateAsString + } +databaseDependenciesForPartition[FramesTable] = [ReportsTable] +databaseDependenciesForSetup[FramesTable] = [] + +#================================================================================================================= +class PluginsTable(Table): + """Define the table 'plugins'""" + #----------------------------------------------------------------------------------------------------------------- + def __init__ (self, name="plugins", logger=None, **kwargs): + super(PluginsTable, self).__init__(name=name, logger=logger, + creationSql = """ + CREATE TABLE %s ( + id SERIAL NOT NULL, + filename TEXT NOT NULL, + name TEXT NOT NULL, + PRIMARY KEY (id), + CONSTRAINT filename_name_key UNIQUE (filename, name) + );""" % name) + + def insert(self, databaseCursor, rowTuple=None): + databaseCursor.execute("insert into plugins (filename, name) values (%s, %s)", rowTuple) + +databaseDependenciesForSetup[PluginsTable] = [] + +#================================================================================================================= +class PluginsReportsTable(PartitionedTable): + """Define the table 'plugins_reports'""" + #----------------------------------------------------------------------------------------------------------------- + def __init__ (self, logger, **kwargs): + super(PluginsReportsTable, self).__init__(name='plugins_reports', logger=logger, + creationSql=""" + CREATE TABLE plugins_reports ( + report_id integer NOT NULL, + plugin_id integer NOT NULL, + date_processed timestamp with time zone, + version TEXT NOT NULL + );""", + + partitionCreationSqlTemplate=""" + CREATE TABLE %(partitionName)s ( + CONSTRAINT %(partitionName)s_date_check CHECK (TIMESTAMP with time zone '%(startDate)s UTC' <= date_processed and 
date_processed < TIMESTAMP with time zone '%(endDate)s UTC'), + PRIMARY KEY (report_id, plugin_id) + ) + INHERITS (plugins_reports); + CREATE INDEX %(partitionName)s_report_id_date_key ON %(partitionName)s (report_id, date_processed, plugin_id); + ALTER TABLE %(partitionName)s + ADD CONSTRAINT %(partitionName)s_report_id_fkey FOREIGN KEY (report_id) REFERENCES reports_%(compressedStartDate)s(id) ON DELETE CASCADE; + ALTER TABLE %(partitionName)s + ADD CONSTRAINT %(partitionName)s_plugin_id_fkey FOREIGN KEY (plugin_id) REFERENCES plugins(id) ON DELETE CASCADE; + """) + self.insertSql = """insert into TABLENAME (report_id, plugin_id, date_processed, version) values + (%s, %s, %s, %s)""" + #----------------------------------------------------------------------------------------------------------------- + def partitionCreationParameters(self, uniqueIdentifier): + startDate, endDate = uniqueIdentifier + startDateAsString = "%4d-%02d-%02d" % startDate.timetuple()[:3] + compressedStartDateAsString = startDateAsString.replace("-", "") + endDateAsString = "%4d-%02d-%02d" % endDate.timetuple()[:3] + return { "partitionName": "plugins_reports_%s" % compressedStartDateAsString, + "startDate": startDateAsString, + "endDate": endDateAsString, + "compressedStartDate": compressedStartDateAsString + } +databaseDependenciesForPartition[PluginsReportsTable] = [ReportsTable] +databaseDependenciesForSetup[PluginsReportsTable] = [PluginsTable] + + +#================================================================================================================= +class ReleasesRawTable(Table): + """Define the table 'releases_raw'""" + #----------------------------------------------------------------------------------------------------------------- + def __init__ (self, logger, **kwargs): + super(ReleasesRawTable, self).__init__(name = "releases_raw", logger=logger, + creationSql = """ + CREATE TABLE releases_raw ( + product_name citext not null, + version text, + platform text, + build_id numeric, + build_type text, + beta_number int, + repository text + ); + """) + self.insertSql = """INSERT INTO TABLENAME + (product, version, platform, buildid, buildtype, + beta_number, repository) + VALUES (%s, %s, %s, %s, %s, %s, %s)""" + + #----------------------------------------------------------------------------------------------------------------- + def updateDefinition(self, databaseCursor): + if socorro_pg.tablesMatchingPattern(self.name) == []: + #this table doesn't exist yet, create it + self.create(databaseCursor) + +databaseDependenciesForSetup[ReleasesRawTable] = [] + +#================================================================================================================= +class EmailCampaignsTable(Table): + """Define the table 'email_campaigns' + Notes: * email_count is populated after the record is inserted (TBD) + * product/versions is denormalized to record versions used, but isn't searchable + """ + #----------------------------------------------------------------------------------------------------------------- + def __init__ (self, logger, **kwargs): + super(EmailCampaignsTable, self).__init__(name = "email_campaigns", logger=logger, + creationSql = """ + CREATE TABLE email_campaigns ( + id serial NOT NULL PRIMARY KEY, + product TEXT NOT NULL, + versions TEXT NOT NULL, + signature TEXT NOT NULL, + subject TEXT NOT NULL, + body TEXT NOT NULL, + start_date timestamp with time zone NOT NULL, + end_date timestamp with time zone NOT NULL, + email_count INTEGER DEFAULT 0, + author TEXT NOT NULL, + 
status TEXT NOT NULL DEFAULT 'stopped', + date_created timestamp with time zone NOT NULL DEFAULT now()); + CREATE INDEX email_campaigns_product_signature_key ON email_campaigns (product, signature); + """) + self.insertSql = """INSERT INTO email_campaigns (product, versions, signature, subject, body, start_date, end_date, email_count, author) + VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s) RETURNING id""" + + #----------------------------------------------------------------------------------------------------------------- + def updateDefinition(self, databaseCursor): + if socorro_pg.tablesMatchingPattern(self.name) == []: + #this table doesn't exist yet, create it + self.create(databaseCursor) + +databaseDependenciesForSetup[EmailCampaignsTable] = [] + +#================================================================================================================= +class EmailContactsTable(Table): + """Define the table 'email_contacts' + Notes: subscribe_token - UUID which is used in urls for a user to manage their subscription. + subscribe_status - Captures user's opt-out status. True - we can email, False - no email + """ + #----------------------------------------------------------------------------------------------------------------- + def __init__ (self, logger, **kwargs): + super(EmailContactsTable, self).__init__(name = "email_contacts", logger=logger, + creationSql = """ + CREATE TABLE email_contacts ( + id serial NOT NULL PRIMARY KEY, + email TEXT NOT NULL, + subscribe_token TEXT NOT NULL, + subscribe_status BOOLEAN DEFAULT TRUE, + ooid TEXT NOT NULL, + crash_date TIMESTAMP with time zone, + CONSTRAINT email_contacts_email_unique UNIQUE (email), + CONSTRAINT email_contacts_token_unique UNIQUE (subscribe_token) + ); + """) + self.insertSql = """INSERT INTO email_contacts (email, subscribe_token, ooid, crash_date) VALUES (%s, %s, %s, %s) RETURNING id""" + #----------------------------------------------------------------------------------------------------------------- + def updateDefinition(self, databaseCursor): + if socorro_pg.tablesMatchingPattern(self.name) == []: + #this table doesn't exist yet, create it + self.create(databaseCursor) + +databaseDependenciesForSetup[EmailContactsTable] = [] + +#================================================================================================================= +class EmailCampaignsContactsTable(Table): + """Define the table 'email_campaigns_contacts' + Notes: Mapping table many to many + Tracks status of emails to-be-sent + """ + #----------------------------------------------------------------------------------------------------------------- + def __init__ (self, logger, **kwargs): + super(EmailCampaignsContactsTable, self).__init__(name = "email_campaigns_contacts", logger=logger, + creationSql = """ + CREATE TABLE email_campaigns_contacts ( + email_campaigns_id INTEGER REFERENCES email_campaigns (id), + email_contacts_id INTEGER REFERENCES email_contacts (id), + -- status will be ready, allocated to mailer _mailerid, sent, or failed (return code) + status TEXT NOT NULL DEFAULT 'ready', + CONSTRAINT email_campaigns_contacts_mapping_unique UNIQUE (email_campaigns_id, email_contacts_id) + ); + """) + self.insertSql = """INSERT INTO email_campaigns_contacts (email_campaigns_id, email_contacts) VALUES (%s, %s) RETURNING id""" + + #----------------------------------------------------------------------------------------------------------------- + def updateDefinition(self, databaseCursor): + if 
socorro_pg.tablesMatchingPattern(self.name) == []: + #this table doesn't exist yet, create it + self.create(databaseCursor) + +databaseDependenciesForSetup[EmailCampaignsContactsTable] = [EmailCampaignsTable, EmailContactsTable] + +#================================================================================================================= +class ReportsDuplicatesTable(Table): + """Define the table 'reports_duplicates' and related functions + """ + #----------------------------------------------------------------------------------------------------------------- + def __init__ (self, logger, **kwargs): + super(ReportsDuplicatesTable, self).__init__(name = "reports_duplicates", logger=logger, + creationSql = """ + -- create table for possible duplicates + -- not partitioned, for now + + create table reports_duplicates ( + uuid text not null primary key, + duplicate_of text not null, + date_processed timestamp not null + ); + + create index reports_duplicates_leader on reports_duplicates(duplicate_of); + + -- SQL function to make comparing timestamp deltas a bit + -- less verbose + + create or replace function same_time_fuzzy( + date1 timestamptz, date2 timestamptz, + interval_secs1 int, interval_secs2 int + ) returns boolean + language sql as $f$ + SELECT + -- return true if either interval is null + -- so we don't exclude crashes missing data + CASE WHEN $3 IS NULL THEN + TRUE + WHEN $4 IS NULL THEN + TRUE + -- otherwise check that the two timestamp deltas + -- and the two interval deltas are within 60 sec + -- of each other + ELSE + ( + extract ('epoch' from ( $2 - $1 ) ) - + ( $4 - $3 ) + ) BETWEEN -60 AND 60 + END; + $f$; + + -- function to be called hourly to update + -- possible duplicates table + + create or replace function update_reports_duplicates ( + start_time timestamp, end_time timestamp ) + returns int + set work_mem = '256MB' + set temp_buffers = '128MB' + language plpgsql as $f$ + declare new_dups INT; + begin + + -- create a temporary table with the new duplicates + -- for the hour + -- this query contains the duplicate-finding algorithm + -- so it will probably change frequently + + create temporary table new_reports_duplicates + on commit drop + as + select follower.uuid as uuid, + leader.uuid as duplicate_of, + follower.date_processed + from + ( + select uuid, + install_age, + uptime, + client_crash_date, + date_processed, + first_value(uuid) + over ( partition by + product, + version, + build, + signature, + cpu_name, + cpu_info, + os_name, + os_version, + address, + topmost_filenames, + reason, + app_notes, + url + order by + client_crash_date, + uuid + ) as leader_uuid + from reports + where date_processed BETWEEN start_time AND end_time + ) as follower + JOIN + ( select uuid, install_age, uptime, client_crash_date + FROM reports + where date_processed BETWEEN start_time AND end_time ) as leader + ON follower.leader_uuid = leader.uuid + WHERE ( same_time_fuzzy(leader.client_crash_date, follower.client_crash_date, + leader.uptime, follower.uptime) + OR follower.uptime < 60 + ) + AND + same_time_fuzzy(leader.client_crash_date, follower.client_crash_date, + leader.install_age, follower.install_age) + AND follower.uuid <> leader.uuid; + + -- insert a copy of the leaders + + insert into new_reports_duplicates + select uuid, uuid, date_processed + from reports + where uuid IN ( select duplicate_of + from new_reports_duplicates ) + and date_processed BETWEEN start_time AND end_time; + + analyze new_reports_duplicates; + + select count(*) into new_dups from 
new_reports_duplicates; + + -- insert new duplicates into permanent table + + insert into reports_duplicates (uuid, duplicate_of, date_processed ) + select new_reports_duplicates.* + from new_reports_duplicates + left outer join reports_duplicates USING (uuid) + where reports_duplicates.uuid IS NULL; + + -- done return number of dups found and exit + RETURN new_dups; + end;$f$; + """) + +databaseDependenciesForSetup[ReportsDuplicatesTable] = [] + +#================================================================================================================= +class ProductIdMapTable(Table): + """Define the table 'product_productid_map' + Notes: Provides override mapping for product name based on productID + """ + #----------------------------------------------------------------------------------------------------------------- + def __init__ (self, logger, **kwargs): + super(ProductIdMapTable, self).__init__(name = "product_productid_map", logger=logger, + creationSql = """ + CREATE TABLE product_productid_map ( + product_name citext NOT NULL, + productid text NOT NULL, + rewrite boolean NOT NULL DEFAULT FALSE, + version_began numeric NOT NULL, + version_ended numeric + ); + """) + self.insertSql = """INSERT INTO product_productid_map (product_name, productid, rewrite, version_began, + version_ended) values (%s, %s, %s, %s, %s)""" + +databaseDependenciesForSetup[ProductIdMapTable] = [] + + +#================================================================================================================= +class TransformRules(Table): + """a single source for transformation rules based on the TransformRules classes + """ + #----------------------------------------------------------------------------------------------------------------- + def __init__ (self, logger, **kwargs): + super(TransformRules, self).__init__(name = "transform_rules", logger=logger, + creationSql = """ + CREATE TABLE transform_rules ( + transform_rule_id SERIAL NOT NULL PRIMARY KEY, + category CITEXT NOT NULL, + rule_order INT NOT NULL, + predicate TEXT NOT NULL DEFAULT '', + predicate_args TEXT NOT NULL DEFAULT '', + predicate_kwargs TEXT NOT NULL DEFAULT '', + action TEXT NOT NULL DEFAULT '', + action_args TEXT NOT NULL DEFAULT '', + action_kwargs TEXT NOT NULL DEFAULT '', + constraint transform_rules_key UNIQUE (category, rule_order) + DEFERRABLE INITIALLY DEFERRED + ); + """) + self.insertSql = """INSERT INTO transform_rules (category, predicate, predicate_args, predicate_kwargs, + action, action_args, action_args) values (%s, %s, %s, %s, %s)""" + +databaseDependenciesForSetup[TransformRules] = [] + + +#----------------------------------------------------------------------------------------------------------------- +def connectToDatabase(config, logger): + databaseDSN = "host=%(database_hostname)s dbname=%(database_name)s user=%(database_username)s password=%(database_password)s" % config + databaseConnection = pg.connect(databaseDSN) + #databaseCursor = databaseConnection.cursor(cursor_factory=socorro_psy.LoggingCursor) + #databaseCursor.setLogger(logger) + databaseCursor = databaseConnection.cursor() + return (databaseConnection, databaseCursor) + +#----------------------------------------------------------------------------------------------------------------- +def setupDatabase(config, logger): + databaseConnection, databaseCursor = connectToDatabase(config, logger) + + try: + databaseCursor.execute("CREATE LANGUAGE plpgsql") + except: + databaseConnection.rollback() + + try: + databaseCursor.execute("CREATE 
EXTENSION citext") + except: + databaseConnection.rollback() + + try: + for aDatabaseObjectClass in getOrderedSetupList(): + aDatabaseObject = aDatabaseObjectClass(logger=logger) + aDatabaseObject._createSelf(databaseCursor) + databaseConnection.commit() + except Exception,x: + databaseConnection.rollback() + socorro_util.reportExceptionAndAbort(logger) + +#----------------------------------------------------------------------------------------------------------------- +def teardownDatabase(config,logger): + global partitionCreationHistory + databaseConnection,databaseCursor = connectToDatabase(config,logger) + try: + for databaseObjectClass in getOrderedSetupList(): + aDatabaseObject = databaseObjectClass(logger=logger) + aDatabaseObject.drop(databaseCursor) + databaseConnection.commit() + partitionCreationHistory = set() + except: + databaseConnection.rollback() + socorro_util.reportExceptionAndContinue(logger) + +#----------------------------------------------------------------------------------------------------------------- +databaseObjectClassListForUpdate = [#ReportsTable, + #DumpsTable, + ExtensionsTable, + FramesTable, + ] +#----------------------------------------------------------------------------------------------------------------- +def updateDatabase(config, logger): + databaseConnection, databaseCursor = connectToDatabase(config, logger) + try: + #try: + #databaseCursor.execute("CREATE LANGUAGE plpythonu") + #except: + #databaseConnection.rollback() + for aDatabaseObjectClass in databaseObjectClassListForUpdate: + aDatabaseObject = aDatabaseObjectClass(logger=logger) + aDatabaseObject.updateDefinition(databaseCursor) + databaseConnection.commit() + except: + databaseConnection.rollback() + socorro_util.reportExceptionAndAbort(logger) + +#----------------------------------------------------------------------------------------------------------------- +# list all the tables that should have weekly partitions pre-created. This is a subclass of all the PartitionedTables +# since it may be that some PartitionedTables should not be pre-created. +databaseObjectClassListForWeeklyPartitions = [ReportsTable, + #DumpsTable, + FramesTable, + ExtensionsTable, + PluginsReportsTable, + ] +#----------------------------------------------------------------------------------------------------------------- +def createPartitions(config, logger): + """ + Create a set of partitions for all the tables known to be efficient when they are created prior to being needed. + see the list databaseObjectClassListForWeeklyParitions above + """ + databaseConnection, databaseCursor = connectToDatabase(config, logger) + try: + for aDatabaseObjectClass in databaseObjectClassListForWeeklyPartitions: + weekIterator = mondayPairsIteratorFactory(config.startDate, config.endDate) + aDatabaseObject = aDatabaseObjectClass(logger=logger) + aDatabaseObject.createPartitions(databaseCursor, weekIterator) + databaseConnection.commit() + except: + databaseConnection.rollback() + socorro_util.reportExceptionAndAbort(logger) + diff --git a/socorro/external/filesystem/__init__.py b/socorro/external/filesystem/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/socorro/external/filesystem/crash_data.py b/socorro/external/filesystem/crash_data.py new file mode 100644 index 0000000000..861316026b --- /dev/null +++ b/socorro/external/filesystem/crash_data.py @@ -0,0 +1,15 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. 
If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +from socorro.external.crash_data_base import CrashDataBase + + +class CrashData(CrashDataBase): + + """ + Implement the /crash_data service with the file system. + """ + + def get_storage(self): + return self.config.filesystem.filesystem_class(self.config.filesystem) diff --git a/socorro/external/filesystem/crashstorage.py b/socorro/external/filesystem/crashstorage.py new file mode 100644 index 0000000000..9377ae45e5 --- /dev/null +++ b/socorro/external/filesystem/crashstorage.py @@ -0,0 +1,484 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +"""The classes defined herein store crash data in a file system. This is the +original method of long term storage used by Socorro in the 2007-2010 time +frame prior to the adoption of HBase. Crashes are stored in a radix directory +tree based on pairs of characters from the crashes' crash_id. In addition, a +second tree of directories stores symbolic links to the crashes in a date +based hierarchy. + +There are three classes defined in this file (as of 2012). Each one derives +from the previous and adds capablities. See the doc strings for more detail""" + +import stat +import os +import json +import datetime + +from configman import Namespace + +from socorro.external.filesystem.json_dump_storage import (JsonDumpStorage, + NoSuchUuidFound) +from socorro.external.filesystem.processed_dump_storage import \ + ProcessedDumpStorage +from socorro.external.crashstorage_base import (CrashStorageBase, + CrashIDNotFound) +from socorro.lib.util import DotDict +from socorro.collector.throttler import ACCEPT + + +#============================================================================== +class FileSystemRawCrashStorage(CrashStorageBase): + """This crash storage class impements only the raw crash part of the + api. Raw crashes (the json file and the binary dump) are stored in a + file system. This class is appropriate for fast storage of crashes into + a local file system. 
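+
+    A rough usage sketch (illustrative only; in a real deployment a
+    configman driven application builds this object from the options
+    declared in required_config below, and raw_crash, dumps and crash_id
+    arrive from the collector):
+
+        store.save_raw_crash(raw_crash,
+                             {'upload_file_minidump': dump},
+                             crash_id)
+        raw_crash = store.get_raw_crash(crash_id)
+        dumps = store.get_raw_dumps(crash_id)
+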
In 2011, a varient of this code base was adopted + by the Socorro Collector for fast temporary storage as crashes came in.""" + + required_config = Namespace() + required_config.add_option( + 'std_fs_root', + doc='a path to a local file system', + default='./primaryCrashStore', + reference_value_from='resource.filesystem', + ) + required_config.add_option( + 'dump_dir_count', + doc='the number of dumps to be stored in a single directory in the ' + 'local file system', + default=1024, + reference_value_from='resource.filesystem', + ) + required_config.add_option( + 'dump_gid', + doc='the group ID for saved crashes in local file system (optional)', + default='', + reference_value_from='resource.filesystem', + ) + required_config.add_option( + 'dump_permissions', + doc='a number used for permissions crash dump files in the local ' + 'file system', + default=stat.S_IRGRP | stat.S_IWGRP | stat.S_IRUSR | stat.S_IWUSR, + reference_value_from='resource.filesystem', + ) + required_config.add_option( + 'dir_permissions', + doc='a number used for permissions for directories in the local ' + 'file system', + default=(stat.S_IRGRP | stat.S_IXGRP | stat.S_IWGRP | stat.S_IRUSR + | stat.S_IXUSR | stat.S_IWUSR), + reference_value_from='resource.filesystem', + ) + required_config.add_option( + 'json_file_suffix', + doc='the suffix used to identify a json file', + default='.json', + reference_value_from='resource.filesystem', + ) + required_config.add_option( + 'dump_file_suffix', + doc='the suffix used to identify a dump file', + default='.dump', + reference_value_from='resource.filesystem', + ) + + #-------------------------------------------------------------------------- + def __init__(self, config, quit_check_callback=None): + super(FileSystemRawCrashStorage, self).__init__(config) + self.std_crash_store = JsonDumpStorage( + root=config.std_fs_root, + maxDirectoryEntries=config.dump_dir_count, + jsonSuffix=config.json_file_suffix, + dumpSuffix=config.dump_file_suffix, + dumpGID=config.dump_gid, + dumpPermissions=config.dump_permissions, + dirPermissions=config.dir_permissions, + logger=config.logger + ) + self.hostname = os.uname()[1] + + #-------------------------------------------------------------------------- + def _load_raw_crash_from_file(self, pathname): + with open(pathname) as json_file: + raw_crash = json.load(json_file, object_hook=DotDict) + return raw_crash + + #-------------------------------------------------------------------------- + def _do_save_raw(self, + json_storage_system, + raw_crash, + dumps, + crash_id): + json_storage_system.new_entry( + crash_id, + raw_crash, + dumps, + self.hostname + ) + + #-------------------------------------------------------------------------- + def save_raw_crash(self, raw_crash, dumps, crash_id): + """forward the raw_crash and the dump to the underlying file system""" + self._do_save_raw(self.std_crash_store, raw_crash, dumps, crash_id) + + def save_raw_and_processed(self, raw_crash, dumps, processed_crash, crash_id): + """ bug 866973 - do not try to save dumps=None into the Filesystem + We are doing this in lieu of a queuing solution that could allow + us to operate an independent crashmover. When the queuing system + is implemented, we could remove this, and have the raw crash + saved by a crashmover that's consuming crash_ids the same way + that the processor consumes them. 
+ + Even though it is ok to resave the raw_crash in this case to the + filesystem, the fs does not know what to do with a dumps=None + when passed to save_raw, so we are going to avoid that. + """ + self.save_processed(processed_crash) + + #-------------------------------------------------------------------------- + def get_raw_crash(self, crash_id): + """fetch the raw crash from the underlying file system""" + try: + pathname = self.std_crash_store.getJson(crash_id) + return self._load_raw_crash_from_file(pathname) + except OSError: + raise CrashIDNotFound(crash_id) + except ValueError: # empty json file? + return DotDict() + + #-------------------------------------------------------------------------- + def get_raw_dump(self, crash_id, name=None): + """read the binary crash dump from the underlying file system by + getting the pathname and then opening and reading the file.""" + try: + job_pathname = self.std_crash_store.getDump(crash_id, name) + with open(job_pathname) as dump_file: + binary = dump_file.read() + return binary + except OSError: + raise CrashIDNotFound(crash_id) + + #-------------------------------------------------------------------------- + def _do_get_raw_dumps(self, crash_id, crash_store): + try: + dumpname_paths_map = crash_store.get_dumps(crash_id) + dumpname_dump_map = {} + for dump_name, dump_pathname in dumpname_paths_map.iteritems(): + with open(dump_pathname, 'rb') as f: + dumpname_dump_map[dump_name] = f.read() + return dumpname_dump_map + except OSError: + raise CrashIDNotFound(crash_id) + + #-------------------------------------------------------------------------- + def get_raw_dumps(self, crash_id): + """read the all the binary crash dumps from the underlying file system + by getting the pathnames and then opening and reading the files. + returns a dict of dump names to binary dumps""" + return self._do_get_raw_dumps(crash_id, self.std_crash_store) + + #-------------------------------------------------------------------------- + def get_raw_dumps_as_files(self, crash_id): + """read the all the binary crash dumps from the underlying file system + by getting the pathnames and then opening and reading the files. + returns a dict of dump names to binary dumps""" + return self.std_crash_store.get_dumps(crash_id) + + #-------------------------------------------------------------------------- + def new_crashes(self): + """return an iterator that yields a list of crash_ids of raw crashes + that were added to the file system since the last time this iterator + was requested.""" + # why is this called 'destructiveDateWalk'? The underlying code + # that manages the filesystem uses a tree of radix date directories + # and symbolic links to track "new" raw crashes. As the the crash_ids + # are fetched from the file system, the symbolic links are removed and + # directories are deleted. Essentially, the state of what is + # considered to be new is saved within the file system by those links. 
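+        # Illustrative only: a polling consumer (names here are
+        # hypothetical) would typically drain this iterator once per pass:
+        #
+        #     for crash_id in raw_store.new_crashes():
+        #         submit_for_processing(crash_id)
+        #
+        # crash_ids yielded on one pass are not yielded again on the next,
+        # because their symbolic links have already been consumed.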
+ return self.std_crash_store.destructiveDateWalk() + + #-------------------------------------------------------------------------- + def remove(self, crash_id): + """delegate removal of a raw crash to the underlying filesystem""" + try: + self.std_crash_store.quickDelete(crash_id) + except NoSuchUuidFound: + raise CrashIDNotFound(crash_id) + + +#============================================================================== +class FileSystemThrottledCrashStorage(FileSystemRawCrashStorage): + """This varient of file system storage segregates crashes based on + the result of Collector throttling. When the collector recieves a crash, + it applies throttle rules and saves the result in the crash json under the + key 'legacy_processing'. Only crashes that have a value of 0 in that field + will eventually make it on to processing. + legacy_processing == 0 : crashes stored in the filesystem rooted at + 'std_fs_root' (standard file system storage) + defined in the parent class + legacy_processing == 1 : crashes stored in the filesysetm rooted at + 'def_fs_root' (deferred file system storage) + defined in this class + This class only implements raw crash storage and is not appropriate for + storing processed crashes.""" + + required_config = Namespace() + required_config.add_option( + 'def_fs_root', + doc='a path to a local file system', + default='./deferredCrashStore', + reference_value_from='resource.filesystem', + ) + + #-------------------------------------------------------------------------- + def __init__(self, config, quit_check_callback=None): + super(FileSystemThrottledCrashStorage, self).__init__(config) + + self.def_crash_store = JsonDumpStorage( + root=config.def_fs_root, + maxDirectoryEntries=config.dump_dir_count, + jsonSuffix=config.json_file_suffix, + dumpSuffix=config.dump_file_suffix, + dumpGID=config.dump_gid, + dumpPermissions=config.dump_permissions, + dirPermissions=config.dir_permissions, + logger=config.logger + ) + self._crash_store_tuple = (self.std_crash_store, + self.def_crash_store) + + #-------------------------------------------------------------------------- + def save_raw_crash(self, raw_crash, dump, crash_id): + """save the raw crash and the dump in the appropriate file system + based on the value of 'legacy_processing' with the raw_crash itself""" + try: + if raw_crash['legacy_processing'] == ACCEPT: + self._do_save_raw( + self.std_crash_store, + raw_crash, + dump, + crash_id + ) + else: + self._do_save_raw( + self.def_crash_store, + raw_crash, + dump, + crash_id + ) + except KeyError: + # if 'legacy_processing' is missing, then it assumed that this + # crash should be processed. 
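+            # routing summary (per the class docstring above):
+            #     legacy_processing == ACCEPT (0) -> std_fs_root tree
+            #     any other value                 -> def_fs_root tree
+            #     key missing                     -> treated as accepted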
Therefore save it into standard + # storage + self._do_save_raw(self.std_crash_store, raw_crash, dump, crash_id) + + #-------------------------------------------------------------------------- + def get_raw_crash(self, crash_id): + """fetch the raw_crash trying each file system in turn""" + for a_crash_store in self._crash_store_tuple: + try: + pathname = a_crash_store.getJson(crash_id) + return self._load_raw_crash_from_file(pathname) + except OSError: + # only raise the exception if we've got no more file systems + # to look through + if a_crash_store is self._crash_store_tuple[-1]: + raise CrashIDNotFound(crash_id) + + #-------------------------------------------------------------------------- + def get_raw_dump(self, crash_id, name=None): + """fetch the dump trying each file system in turn""" + for a_crash_store in self._crash_store_tuple: + try: + job_pathname = a_crash_store.getDump(crash_id, name) + with open(job_pathname) as dump_file: + dump = dump_file.read() + return dump + except OSError: + # only raise the exception if we've got no more file systems + # to look through + if a_crash_store is self._crash_store_tuple[-1]: + raise CrashIDNotFound(crash_id) + + #-------------------------------------------------------------------------- + def get_raw_dumps(self, crash_id): + """fetch the dump trying each file system in turn""" + for a_crash_store in self._crash_store_tuple: + try: + return self._do_get_raw_dumps(crash_id, a_crash_store) + except CrashIDNotFound: + pass # try the next crash store + raise CrashIDNotFound(crash_id) + + #-------------------------------------------------------------------------- + def get_raw_dumps_as_files(self, crash_id): + """fetch the dump trying each file system in turn""" + for a_crash_store in self._crash_store_tuple: + try: + return a_crash_store.get_dumps(crash_id) + except CrashIDNotFound: + pass # try the next crash store + raise CrashIDNotFound(crash_id) + + #-------------------------------------------------------------------------- + def remove(self, crash_id): + """try to remove the raw_crash and the dump from each """ + for a_crash_store in self._crash_store_tuple: + try: + a_crash_store.remove(crash_id) # raises NoSuchUuidFound if + # unsuccessful. + return # break the loop as soon as we succeed + except (NoSuchUuidFound, OSError): + # only raise the exception if we've got no more file systems + # to look through + if a_crash_store is self._crash_store_tuple[-1]: + raise CrashIDNotFound(crash_id) + + +#============================================================================== +class FileSystemCrashStorage(FileSystemThrottledCrashStorage): + """This storage class is the only file system based crash storage system + appropriate for storing both raw and processed crashes. This class uses + the same segregating raw crash storage as the previous class and adds + processed storage. 
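+
+    With the default settings declared in this class and its parents, a
+    single instance therefore manages three directory trees (paths are the
+    shipped defaults and are illustrative):
+
+        ./primaryCrashStore     raw crashes accepted by the throttler
+        ./deferredCrashStore    raw crashes deferred by the throttler
+        ./processedCrashStore   processed crashes
+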
Processed crashes are stored in their own file system + root, 'pro_fs_root' (processed file system root) using the same radix + directory system as the raw crashes.""" + + required_config = Namespace() + required_config.add_option( + 'pro_fs_root', + doc='a path to a local file system for processed storage', + default='./processedCrashStore', + reference_value_from='resource.filesystem', + ) + required_config.add_option( + 'minutes_per_slot', + doc='the number of minutes in the lowest date directory', + default=1, + reference_value_from='resource.filesystem', + ) + required_config.add_option( + 'sub_slot_count', + doc='distribute data evenly among this many sub timeslots', + default=1, + reference_value_from='resource.filesystem', + ) + required_config.add_option( + 'index_name', + doc='the relative path to the top of the name storage tree from ' + 'root parameter', + default='name', + reference_value_from='resource.filesystem', + ) + required_config.add_option( + 'date_name', + doc='the relative path to the top of the date storage tree from ' + 'root parameter', + default='date', + reference_value_from='resource.filesystem', + ) + required_config.add_option( + 'processed_crash_file_suffix', + doc='the processed crash filename suffix', + default='.jsonz', + reference_value_from='resource.filesystem', + ) + required_config.add_option( + 'gzip_compression_level', + doc='the level of compression to use', + default=9, + reference_value_from='resource.filesystem', + ) + required_config.add_option( + 'storage_depth', + doc='the length of branches in the radix storage tree', + default=2, + reference_value_from='resource.filesystem', + ) + + #-------------------------------------------------------------------------- + def __init__(self, config, quit_check_callback=None): + super(FileSystemCrashStorage, self).__init__(config) + self.pro_crash_store = ProcessedDumpStorage( + root=config.pro_fs_root, + minutesPerSlot=config.minutes_per_slot, + subSlotCount=config.sub_slot_count, + indexName=config.index_name, + dateName=config.date_name, + fileSuffix=config.processed_crash_file_suffix, + gzipCompression=config.gzip_compression_level, + storageDepth=config.storage_depth, + dumpGID=config.dump_gid, + dumpPermissions=config.dump_permissions, + dirPermissions=config.dir_permissions, + ) + + #-------------------------------------------------------------------------- + def save_processed(self, processed_crash): + """save a processed crash (in the form of a Mapping) into a json + file. 
It first gets the underlying file system to give it a file + handle open for writing, then it uses the 'json' module to write + the mapping to the open file handle.""" + try: + crash_id = processed_crash['uuid'] + except KeyError: + raise CrashIDNotFound("uuid missing from processed_crash") + try: + self._stringify_dates_in_dict(processed_crash) + processed_crash_file_handle = \ + self.pro_crash_store.newEntry(crash_id) + try: + json.dump(processed_crash, processed_crash_file_handle) + finally: + processed_crash_file_handle.close() + self.logger.debug('saved processed- %s', crash_id) + except Exception: + self.logger.critical( + 'processed file system storage has failed for: %s', + crash_id, + exc_info=True + ) + raise + + #-------------------------------------------------------------------------- + def get_unredacted_processed(self, crash_id): + """fetch a processed json file from the underlying file system""" + try: + return self.pro_crash_store.getDumpFromFile(crash_id) + except OSError: + raise CrashIDNotFound(crash_id) + + #-------------------------------------------------------------------------- + def remove(self, crash_id): + """remove the all traces of a crash, both raw and processed from the + file system.""" + try: + super(FileSystemCrashStorage, self).remove(crash_id) + except CrashIDNotFound: + self.logger.warning( + 'raw crash not found for deletion: %s', + crash_id + ) + try: + self.pro_crash_store.removeDumpFile(crash_id) + except OSError: + self.logger.warning('processed crash not found for deletion: %s', + crash_id) + + #-------------------------------------------------------------------------- + @staticmethod + def _stringify_dates_in_dict(a_dict): + for name, value in a_dict.iteritems(): + if isinstance(value, datetime.datetime): + a_dict[name] = ("%4d-%02d-%02d %02d:%02d:%02d.%d" % + (value.year, + value.month, + value.day, + value.hour, + value.minute, + value.second, + value.microsecond) + ) diff --git a/socorro/external/filesystem/dump_storage.py b/socorro/external/filesystem/dump_storage.py new file mode 100644 index 0000000000..2897f50c68 --- /dev/null +++ b/socorro/external/filesystem/dump_storage.py @@ -0,0 +1,357 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +import datetime +import errno +import logging +import os +from stat import ( + S_IRGRP, S_IXGRP, S_IWGRP, S_IRUSR, S_IXUSR, S_IWUSR, S_IROTH +) + +import socorro.external.filesystem.filesystem as socorro_fs +import socorro.lib.ooid as socorro_ooid +from socorro.lib.datetimeutil import utc_now, UTC + + +class DumpStorage(object): + """ + Base class for storing files that can be quickly accessed based on ooid or + date Note: ooid is a nearly unique identifier assigned external to this + class. 
See socorro/lib/ooid.py + + The storage system is a tree with multiple daily branches each with two + sub-branches: 'name' and 'date' + + The daily subdirectory is created from first available of: + the timestamp passed to newEntry, + or if None, from the last 6 characters of the ooid + or if the ooid is not date-encoded, from today's date + + Thus, the tree looks like root/YYYYmmDD/name/xx/xx + /date/HH/mm + + Within the 'name' branch, files are located using a radix structure based + on an ooid + The 'name' path is as follows: + - Below the storage root and daily branch is the 'name' directory (can be + configured) + - Below that are one or more subdirectories each named with the next pair + of characters from the ooid + The depth of that path is 2 (as encoded in the ooid when it is generated) + - One or more files may be stored at that location. + Example: For the ooid '4dd21cc0-49d9-46ae-a42b-fadb42090928', the name path + is root/20090928/name/4d/d2 (depth of 2 is encoded at ooid[-7]) + + The 'date' path is as follows: + - Below the storage root and daily branch is the 'date' directory (name can + be configured) + - Below that are 2 subdirectories corresponding to hour and minute-window + - For each stored item, a single symbolic link is stored in the date + directory. + The name of the link is the ooid, the value is a relative path to the + 'name' directory of the ooid + """ + + def __init__(self, root='.', osModule=os, **kwargs): + """ + Take note of our root directory, and override defaults if any in + kwargs: + - dateName overrides 'date' + - indexName overrides 'name' + - minutesPerSlot is the size of each bin in the date path. Default 5 + - dirPermissions sets the permissions for all directories in name and + date paths. Default 'rwxrwx---' + - dumpPermissions sets the permissions for actual stored files (this + class creates no files) + - dumpGID sets the group ID for all directoies in name and date paths. + Default None. + """ + super(DumpStorage, self).__init__() + self.osModule = osModule + self.root = root.rstrip(os.sep) + self.dateName = kwargs.get('dateName', 'date') + self.indexName = kwargs.get('indexName', 'name') + self.minutesPerSlot = int(kwargs.get('minutesPerSlot', 5)) + self.subSlotCount = int(kwargs.get('subSlotCount', 0)) + self.dirPermissions = int(kwargs.get( + 'dirPermissions', + '%d' % (S_IRGRP | S_IXGRP | S_IWGRP | S_IRUSR | S_IXUSR | S_IWUSR)) + ) + self.dumpPermissions = int(kwargs.get( + 'dumpPermissions', + '%d' % (S_IRGRP | S_IWGRP | S_IRUSR | S_IWUSR)) + ) + self.dump_field = kwargs.get('dump_field', 'upload_file_minidump') + self.dumpGID = kwargs.get('dumpGID', None) + try: + if self.dumpGID: + self.dumpGID = int(self.dumpGID) + except ValueError: + if self.dumpGID == '': + self.dumpGID = None + else: + raise + + self.logger = kwargs.get('logger', logging.getLogger('dumpStorage')) + self.currentSubSlots = {} + self.logger.debug( + "Constructor has set the following values:\n" + " self.root: %s\n" + " self.dateName: %s\n" + " self.indexName: %s\n" + " self.minutesPerSlot: %s\n" + " self.subSlotCount: %s\n" + " self.dirPermissions: %o\n" + " self.dumpPermissions: %o\n" + " self.dumpGID: %s\n" % ( + self.root, + self.dateName, + self.indexName, + self.minutesPerSlot, + self.subSlotCount, + self.dirPermissions, + self.dumpPermissions, + self.dumpGID + ) + ) + + def newEntry(self, ooid, timestamp=None, webheadName=None): + """ + Sets up the name and date storage directory branches for the given + ooid. 
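+        For example, reusing the ooid from the class docstring above with a
+        2009-09-28 12:05 timestamp and default settings, the returned pair
+        would be roughly:
+            root/20090928/name/4d/d2
+            root/20090928/date/12/05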
+ Creates any needed directories along the path to the appropriate + storage location. Sets gid and mode if specified + Creates one symbolic link in the date leaf directory with name ooid + and referencing the name leaf directory + returns (nameDir,dateDir) + """ + if not timestamp: + timestamp = socorro_ooid.dateFromOoid(ooid) + if not timestamp: + timestamp = utc_now() + if not self.osModule.path.isdir(self.root): + um = self.osModule.umask(0) + try: + self.osModule.mkdir(self.root, self.dirPermissions) + finally: + self.osModule.umask(um) + nameDir, nparts = self.makeNameDir(ooid, timestamp) + dateDir, dparts = self.makeDateDir(timestamp, webheadName) + # adjust the current subslot only when inserting a new entry + if self.subSlotCount: + k = dparts[-1].split('_')[0] + curcount = self.currentSubSlots.setdefault(k, 0) + self.currentSubSlots[k] = (curcount + 1) % self.subSlotCount + parts = [os.path.pardir, ] * (len(dparts) - 2) # lose root / dailypart + parts.append(self.indexName) + parts.extend(self.relativeNameParts(ooid)) + relNameDir = os.sep.join(parts) + try: + self.osModule.symlink(relNameDir, os.path.join(dateDir, ooid)) + except OSError, x: + if errno.ENOENT == x.errno: + # maybe another thread cleaned this out from under us. Try again + nameDir = self.makeNameDir(ooid) # might be overkill, + # but cheap insurance + dateDir = self.makeDateDir(timestamp) + self.osModule.symlink(relNameDir, os.path.join(dateDir, ooid)) + elif errno.EEXIST == x.errno: + self.osModule.unlink(os.path.join(dateDir, ooid)) + self.osModule.symlink(relNameDir, os.path.join(dateDir, ooid)) + else: + raise + if self.dumpGID: + self.osModule.chown(os.path.join(dateDir, ooid), -1, self.dumpGID) + return (nameDir, dateDir) + + def chownGidVisitor(self, path): + """a convenience function""" + self.osModule.chown(path, -1, self.dumpGID) + + def relativeNameParts(self, ooid): + depth = socorro_ooid.depthFromOoid(ooid) + if not depth: + depth = 4 + return [ooid[2 * x: 2 * x + 2] for x in range(depth)] + + def dailyPart(self, ooid, timestamp=None): + """ + return YYYYMMDD + use the timestamp if any, else the ooid's last 6 chars if reasonable, + else now() + """ + year, month, day = None, None, None + if not timestamp: + timestamp = socorro_ooid.dateFromOoid(ooid) + if not timestamp: + timestamp = utc_now() + (year, month, day) = (timestamp.year, timestamp.month, timestamp.day) + return "%4d%02d%02d" % (year, month, day) + + def pathToDate(self, datePath): + """ + Given a path to the date branch leaf node, return a corresponding + datetime.datetime() Note that because of bucketing, the minute will be + no more accurate than the bucket size + """ + # normalize to self.root + if not datePath: + return None + parts = os.path.abspath(datePath).split(os.sep) + root = os.path.split(self.root)[1] + parts = parts[parts.index(root):] + minute = 0 + hour = 0 + try: + minute = int(parts[-1].split('_')[0]) + hour = int(parts[-2]) + except ValueError: + try: + minute = int(parts[-2].split('_')[0]) + hour = int(parts[-3]) + except ValueError: + pass + return datetime.datetime( + int(parts[1][:4]), + int(parts[1][4:6]), + int(parts[1][-2:]), + int(hour), + minute, + tzinfo=UTC + ) + + def lookupNamePath(self, ooid, timestamp=None): + """ + Find an existing name-side directory for the given ooid, return + (dirPath,dirParts) on failure, return (None,[]) + """ + nPath, nParts = self.namePath(ooid, timestamp) + if self.osModule.path.exists(nPath): + return nPath, nParts + else: + dailyDirs = self.osModule.listdir(self.root) + for d 
in dailyDirs: + nParts[1] = d + path = os.sep.join(nParts) + if self.osModule.path.exists(path): + return path, nParts + return (None, []) + + def namePath(self, ooid, timestamp=None): + """ + Return the path to the directory for this ooid and the directory parts + of the path. Ignores encoded ooid depth, uses depth from __init__ + (default 2) + """ + ooidDay, depth = socorro_ooid.dateAndDepthFromOoid(ooid) + if not depth: + depth = 4 + dirs = [self.root, self.dailyPart(ooid, timestamp), self.indexName] + dirs.extend(self.relativeNameParts(ooid)) + #self.logger.debug( + #"%s - %s -> %s", + #threading.currentThread().getName(), + #ooid, + #dirs + #) + return os.sep.join(dirs), dirs + + def makeNameDir(self, ooid, timestamp=None): + """Make sure the name directory exists, and return its path, and list + of path components. Raises OSError on failure""" + npath, nparts = self.namePath(ooid, timestamp) + #self.logger.debug( + # "%s - trying makedirs %s", + # threading.currentThread().getName(), + # npath + #) + um = self.osModule.umask(0) + try: + try: + socorro_fs.makedirs(npath, self.dirPermissions, self.osModule) + except OSError: + if not self.osModule.path.isdir(npath): + raise + finally: + self.osModule.umask(um) + if self.dumpGID: + socorro_fs.visitPath( + os.path.join(*nparts[:2]), + npath, + self.chownGidVisitor + ) + return npath, nparts + + def lookupOoidInDatePath(self, date, ooid, webheadName=None): + """Look for the date path holding a symbolic link named 'ooid', return + datePath, dateParts on failure return None,[]""" + if not date: + date = socorro_ooid.dateFromOoid(ooid) + if date: + datePath, dateParts = self.datePath(date, webheadName) + if self.osModule.path.exists(os.path.join(datePath, ooid)): + return datePath, dateParts + for d in self.osModule.listdir(self.root): + # We don't know webhead if any, so avoid confusion by looking + # everywhere + for dir, dirs, files in os.walk( + os.sep.join((self.root, d, self.dateName)) + ): + if ooid in dirs or ooid in files: # probably dirs + dirPath = dir + dirParts = dir.split(os.sep) + return dirPath, dirParts + return None, [] + + def datePath(self, date, webheadName=None): + """Return the absolute path to the date subdirectory for the given + date""" + m = date.minute + slot = self.minutesPerSlot * (int(m / self.minutesPerSlot)) + parts = [ + self.root, + self.dailyPart('', date), + self.dateName, + '%02d' % date.hour, + '%02d' % slot + ] + if self.subSlotCount: + if webheadName: + subSlot = self.currentSubSlots.setdefault(webheadName, 0) + parts.append("%s_%d" % (webheadName, subSlot)) + else: + subSlot = self.currentSubSlots.setdefault(slot, 0) + parts[-1] = '%02d_%d' % (slot, subSlot) + return os.sep.join(parts), parts + + def makeDateDir(self, date, webheadName=None): + """Assure existence of date directory for the given date, return path, + and list of components""" + dpath, dparts = self.datePath(date, webheadName) + um = self.osModule.umask(0) + try: + try: + socorro_fs.makedirs(dpath, self.dirPermissions, self.osModule) + except OSError: + if not self.osModule.path.isdir(dpath): + raise + finally: + self.osModule.umask(um) + if self.dumpGID: + socorro_fs.visitPath( + os.path.join(*dparts[:2]), + dpath, + self.chownGidVisitor + ) + return dpath, dparts + + @staticmethod + def readableOrThrow(path): + """ + Throws OSError unless user, group or other has read permission + (Convenience function for derived classes which will all need it) + """ + if not os.stat(path).st_mode & (S_IRUSR | S_IRGRP | S_IROTH): + raise 
OSError(errno.ENOENT, 'Cannot read %s' % path) diff --git a/socorro/external/fs/filesystem.py b/socorro/external/filesystem/filesystem.py similarity index 100% rename from socorro/external/fs/filesystem.py rename to socorro/external/filesystem/filesystem.py diff --git a/socorro/external/filesystem/json_dump_storage.py b/socorro/external/filesystem/json_dump_storage.py new file mode 100644 index 0000000000..262f85c78c --- /dev/null +++ b/socorro/external/filesystem/json_dump_storage.py @@ -0,0 +1,542 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +import errno +import os +import json +import collections + +import socorro.external.filesystem.dump_storage as socorro_dumpStorage +import socorro.external.filesystem.filesystem as socorro_fs +import socorro.lib.util as socorro_util +import socorro.lib.ooid as socorro_ooid + +from socorro.lib.datetimeutil import datetimeFromISOdateString + +from socorro.lib.datetimeutil import utc_now + + +class NoSuchUuidFound(Exception): + pass + + +#============================================================================== +class JsonDumpStorage(socorro_dumpStorage.DumpStorage): + """ + This class implements a file system storage scheme for the JSON and dump + files of the Socorro project. It create a tree with two branches: the name + branch and the date branch. + - The name branch consists of paths based on the first 8 characters of the + crash_id file name. It holds the two data files and a relative symbolic + link to the date branch directory associated with the particular + crash_id. see socorro.lib.ooid.py for details of date and depth + encoding within the crash_id + For the crash_id: 22adfb61-f75b-11dc-b6be-001322081225 + - the json file is stored as + %(root)s/%(daypart)s/name/22/ad/22adfb61-f75b-11dc-b6be-001322081225 + .json + - the dump file is stored as + %(root)s/name/22/ad/22adfb61-f75b-11dc-b6be-001322081225.dump + - the symbolic link is stored as + %(root)s/name/22/ad/22adfb61-f75b-11dc-b6be-001322081225 + and (see below) references + %(toDateFromName)s/date/2008/12/25/12/05/webhead01_0 + - The date branch consists of paths based on the year, month, day, hour, + minute-segment, webhead host name and a small sequence number. + For each crash_id, it holds a relative symbolic link referring to the + actual storage (name) directory holding the data for that crash_id. + For the crash_id above, submitted at 2008-12-25T12:05 from webhead01 + - the symbolic link is stored as + %(root)s/date/2008/09/30/12/05/webhead01_0/22adfb61-f75b-11dc-b6be- + 001322081225 and references %(toNameFromDate)s/name/22/ad/ + + Note: The symbolic links are relative, so they begin with some rounds of + '../'. This is to avoid issues that might arise from variously mounted nfs + volumes. If the layout changes, self.toNameFromDate and toDateFromName + must be changed to match, as well as a number of the private methods. + + Note: If so configured, the bottom nodes in the date path will be + %(webheadName)s_n for n in range(N) for some reasonable (5, perhaps) N. + Files are placed into these buckets in rotation. + """ + #-------------------------------------------------------------------------- + def __init__(self, root=".", osModule=os, **kwargs): + """ + Take note of our root directory and other necessities. + Yes, it is perfectly legal to call super(...).__init__() after doing + some other code. 
As long as you expect the behavior you get, anyway... + """ + kwargs.setdefault('minutesPerSlot', 1) + kwargs.setdefault('subSlotCount', 1) # that is: use xxx_0 every time + # by default + super(JsonDumpStorage, self).__init__( + root=root, + osModule=osModule, + **kwargs + ) + tmp = kwargs.get('cleanIndexDirectories', 'false') + self.cleanIndexDirectories = 'true' == tmp.lower() + self.jsonSuffix = kwargs.get('jsonSuffix', '.json') + if not self.jsonSuffix.startswith('.'): + self.jsonSuffix = ".%s" % (self.jsonSuffix) + self.dumpSuffix = kwargs.get('dumpSuffix', '.dump') + if not self.dumpSuffix.startswith('.'): + self.dumpSuffix = ".%s" % (self.dumpSuffix) + self.logger = kwargs.get('logger', socorro_util.FakeLogger()) + + #-------------------------------------------------------------------------- + def new_entry(self, + crash_id, + raw_crash, + dumps_dict, + webhead_host_name='webhead01', + timestamp=None): + if not isinstance(dumps_dict, collections.Mapping): + dumps_dict = {self.dump_field: dumps_dict} + + name_dir, date_dir = super(JsonDumpStorage, self).newEntry( + crash_id, + datetimeFromISOdateString(raw_crash['submitted_timestamp']), + webhead_host_name + ) + + raw_crash_pathname = os.path.join( + name_dir, + crash_id + self.jsonSuffix + ) + with open(raw_crash_pathname, "w") as rcf: + json.dump(raw_crash, rcf) + + for dump_name, dump in dumps_dict.iteritems(): + full_dump_name = self.dump_file_name(crash_id, dump_name) + dump_pathname = os.path.join( + name_dir, + full_dump_name + ) + with open(dump_pathname, "w") as dp: + dp.write(dump) + self.osModule.chmod(dump_pathname, self.dumpPermissions) + + name_depth = socorro_ooid.depthFromOoid(crash_id) + if not name_depth: + name_depth = 4 + rparts = [os.path.pardir, ] * (1 + name_depth) + rparts.append(self.dateName) + date_depth = 2 # .../hh/mm_slot... + if webhead_host_name and self.subSlotCount: + date_depth = 3 # .../webHeadName_slot + date_parts = date_dir.split(os.path.sep)[-date_depth:] + rparts.extend(date_parts) + self.osModule.symlink( + os.path.sep.join(rparts), + os.path.join(name_dir, crash_id) + ) + + #-------------------------------------------------------------------------- + def newEntry(self, crash_id, webheadHostName='webhead01', timestamp=None): + """ + Sets up the name and date storage directory branches for the given + crash_id. Creates any directories that it needs along the path to the + appropriate storage location. + Creates two relative symbolic links: the date branch link pointing to + the name directory holding the files; + the name branch link pointing to the date branch directory holding that + link. 
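+        A minimal calling sketch (storage, raw_crash, dump and crash_id are
+        assumed to exist already; illustrative only):
+
+            json_file, dump_file = storage.newEntry(crash_id, 'webhead01')
+            try:
+                json.dump(raw_crash, json_file)
+                dump_file.write(dump)
+            finally:
+                json_file.close()
+                dump_file.close()
+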
+ Returns a 2-tuple containing files open for writing: (jsonfile, + dumpfile) + If self.dumpGID, then the file tree from root to and including the data + files are chown'd + If self.dumpPermissions, then chmod is called on the data files + """ + # note: after this call, dateDir already holds link to nameDir + nameDir, dateDir = super(JsonDumpStorage, self).newEntry( + crash_id, + timestamp, + webheadHostName + ) + df, jf = None, None + jname = os.path.join(nameDir, crash_id + self.jsonSuffix) + try: + jf = open(jname, 'w') + except IOError, x: + if 2 == x.errno: + nameDir = self.makeNameDir(crash_id, timestamp) # deliberately + # leave this dir behind if next line throws + jf = open(jname, 'w') + else: + raise x + try: + # Do all this in a try/finally block to unroll in case of error + self.osModule.chmod(jname, self.dumpPermissions) + dname = os.path.join(nameDir, crash_id + self.dumpSuffix) + df = open(dname, 'w') + self.osModule.chmod(dname, self.dumpPermissions) + nameDepth = socorro_ooid.depthFromOoid(crash_id) + if not nameDepth: + nameDepth = 4 + rparts = [os.path.pardir, ] * (1 + nameDepth) + rparts.append(self.dateName) + dateDepth = 2 # .../hh/mm_slot... + if webheadHostName and self.subSlotCount: + dateDepth = 3 # .../webHeadName_slot + dateParts = dateDir.split(os.path.sep)[-dateDepth:] + rparts.extend(dateParts) + self.osModule.symlink( + os.path.sep.join(rparts), + os.path.join(nameDir, crash_id) + ) + if self.dumpGID: + + def chown1(path): + self.osModule.chown(path, -1, self.dumpGID) + socorro_fs.visitPath( + self.root, + os.path.join(nameDir, crash_id + self.jsonSuffix), + chown1, + self.osModule + ) + self.osModule.chown( + os.path.join(nameDir, crash_id + self.dumpSuffix), + -1, + self.dumpGID + ) + # socorro_fs.visitPath(self.root, + # os.path.join(dateDir,crash_id), + # chown1 + # ) + finally: + if not jf or not df: + if jf: + jf.close() + if df: + df.close() + try: + self.osModule.unlink(os.path.join(dateDir, crash_id)) + except Exception: + pass # ok if not there + try: + self.osModule.unlink(os.path.join(nameDir, crash_id)) + except Exception: + pass # ok if not there + df, jf = None, None + return (jf, df) + + #-------------------------------------------------------------------------- + def getJson(self, crash_id): + """ + Returns an absolute pathname for the json file for a given crash_id. + Raises OSError if the file is missing + """ + self.logger.debug('getJson %s', crash_id) + fname = "%s%s" % (crash_id, self.jsonSuffix) + path, parts = self.lookupNamePath(crash_id) + if path: + fullPath = os.path.join(path, fname) + # self.osModule.stat is moderately faster than trying to open + # for reading + self.readableOrThrow(fullPath) + return fullPath + raise OSError(errno.ENOENT, 'No such file: %s%s' % (crash_id, fname)) + + #-------------------------------------------------------------------------- + def getDump(self, crash_id, name=None): + """ + Returns an absolute pathname for the dump file for a given crash_id. 
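+        (for the default dump this is a path of the form
+        <root>/<YYYYMMDD>/name/xx/xx/<crash_id>.dump, following the layout
+        described in the class docstring)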
+ Raises OSError if the file is missing + """ + fname = self.dump_file_name(crash_id, name) + path, parts = self.lookupNamePath(crash_id) + msg = ('%s not stored in "%s/.../%s" file tree' + % (crash_id, self.root, self.indexName)) + if path: + fullPath = os.path.join(path, fname) + msg = "No such file: %s" % (os.path.join(path, fname)) + # self.osModule.stat is moderately faster than trying to open + # for reading + self.readableOrThrow(fullPath) + return fullPath + raise OSError(errno.ENOENT, msg) + + #-------------------------------------------------------------------------- + def _dump_names_from_pathnames(self, pathnames): + dump_names = [] + for a_pathname in pathnames: + base_name = os.path.basename(a_pathname) + dump_name = base_name[37:-len(self.dumpSuffix)] + if not dump_name: + dump_name = self.dump_field + dump_names.append(dump_name) + return dump_names + + #-------------------------------------------------------------------------- + def get_dumps(self, crash_id): + """ + Returns a tuple of paths to dumps + """ + path, parts = self.lookupNamePath(crash_id) + dump_pathnames = [os.path.join(path, dumpfilename) + for dumpfilename in os.listdir(path) + if dumpfilename.startswith(crash_id) and + dumpfilename.endswith(self.dumpSuffix)] + if not dump_pathnames: + raise OSError(errno.ENOENT, 'no dumps for ' + crash_id) + dump_dict = dict(zip(self._dump_names_from_pathnames(dump_pathnames), + dump_pathnames)) + return dump_dict + + #-------------------------------------------------------------------------- + def markAsSeen(self, crash_id): + """ + Removes the links associated with the two data files for this crash_id, + thus marking them as seen. + Quietly returns if the crash_id has no associated links. + """ + namePath, parts = self.namePath(crash_id) + dpath = None + try: + dpath = os.path.join( + namePath, + self.osModule.readlink(os.path.join(namePath, crash_id)) + ) + self.osModule.unlink(os.path.join(dpath, crash_id)) + except OSError, e: + if 2 == e.errno: # no such file or directory + pass + else: + raise e + try: + self.osModule.unlink(os.path.join(namePath, crash_id)) + except OSError, e: + if 2 == e.errno: # no such file or directory + pass + else: + raise e + + #-------------------------------------------------------------------------- + def destructiveDateWalk(self): + """ + This function is a generator that yields all ooids found by walking the + date branch of the file system. Just before yielding a value, it + deletes both the links (from date to name and from name to date) + After visiting all the ooids in a given date branch, recursively + travels up, deleting any empty subdirectories. Since the file system + may be manipulated in a different thread, if no .json or .dump file + is found, the links are left, and we do not yield that crash_id + """ + + def handleLink(dir, name): + nameDir = self.namePath(name)[0] + if not self.osModule.path.isfile( + os.path.join(nameDir, name + self.jsonSuffix) + ): + #print ' handleLink 1' + return None + if not self.osModule.path.isfile( + os.path.join(nameDir, name + self.dumpSuffix) + ): + #print ' handleLink 2' + return None + if self.osModule.path.islink(os.path.join(nameDir, name)): + self.osModule.unlink(os.path.join(nameDir, name)) + self.osModule.unlink(os.path.join(dir, name)) + #print ' handleLink 3' + return name + #print ' handleLink off end' + dailyParts = [] + try: + dailyParts = self.osModule.listdir(self.root) + except OSError: + # If root doesn't exist, quietly do nothing, eh? 
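+            # A missing root just means nothing has been stored yet, so the
+            # generator ends here without yielding anything instead of raising.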
+ return + for daily in dailyParts: + #print 'daily: %s' % daily + for dir, dirs, files in self.osModule.walk( + os.sep.join((self.root, daily, self.dateName)) + ): + #print dir,dirs,files + if (os.path.split(dir)[0] == + os.path.split(self.datePath(utc_now())[0])): + #print 'skipping dir %s' % dir + #print 'because: %s == %s' % (os.path.split(dir)[0], + # os.path.split(self.datePath(utc_now())[0])) + continue + # the links are all to (relative) directories, so we need not + # look at files + for d in dirs: + #print 'dir ', d + if self.osModule.path.islink(os.path.join(dir, d)): + r = handleLink(dir, d) + #print ' r ', r + if r: + yield r + # after finishing a given directory... + socorro_fs.cleanEmptySubdirectories( + os.path.join(self.root, daily), + dir, + self.osModule + ) + + #-------------------------------------------------------------------------- + def remove(self, crash_id, timestamp=None): + """ + Removes all instances of the crash_id from the file system including + the json file, the dump file, and the two links if they still exist. + If it finds no trace of the crash_id: No links, no data files, it + raises a NoSuchUuidFound exception. + Attempts to remove root/daily/date subtree for empty levels above this + date + If self.cleanIndexDirectories, attempts to remove root/daily subtree, + for empty levels above this name storage + """ + namePath, nameParts = self.lookupNamePath(crash_id, timestamp) + something = 0 + if namePath: + try: + datePath = os.path.join( + namePath, + self.osModule.readlink(os.path.join(namePath, crash_id)) + ) + if (self.osModule.path.exists(datePath) + and self.osModule.path.isdir(datePath)): + # We have a date and name path + self._remove( + crash_id, + namePath, + nameParts, + os.path.abspath(datePath), + [] + ) + something += 1 + else: + raise OSError # just to get to the next block + except OSError: + datePath, dateParts = \ + self.lookupOoidInDatePath(timestamp, crash_id) + if datePath: + self._remove( + crash_id, + namePath, + nameParts, + os.path.abspath(datePath), + dateParts + ) + something += 1 + else: + #print crash_id, namePath, nameParts + self._remove(crash_id, namePath, nameParts, None, []) + something += 1 + else: + datePath, dateParts = self.lookupOoidInDatePath(timestamp, + crash_id) + if datePath: + try: + namePath = os.path.normpath( + self.osModule.readlink(os.path.join(datePath, crash_id)) + ) + except OSError: + pass + if namePath or datePath: + self._remove(crash_id, namePath, None, datePath, dateParts) + something += 1 + if not something: + self.logger.warning("%s was totally unknown", crash_id) + raise NoSuchUuidFound("no trace of %s was found" % crash_id) + + #-------------------------------------------------------------------------- + def _remove(self, crash_id, namePath, nameParts, datePath, dateParts): + seenCount = 0 + dailyPart = None + if nameParts: + dailyPart = nameParts[1] + elif namePath: + dailyPart = namePath.split(os.sep, 2)[1] + elif dateParts: + dailyPart = dateParts[1] + elif datePath: + dailyPart = datePath.split(os.sep, 2)[1] + if not dailyPart: + return + stopper = os.path.join(self.root, dailyPart) + # unlink on the name side first, thereby erasing any hope of removing + # relative paths from here... 
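+        # Everything under namePath that starts with this crash_id (the json
+        # file plus any named dump files) is unlinked below; seenCount records
+        # whether anything was actually found.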
+ if namePath: + #print "*****", namePath + #raw_crash_path = self.getJson(crash_id) + #with open(raw_crash_path) as crash_file: + #raw_crash = json.load(crash_file) + #dump_names = raw_crash.get('dump_names', [self.dump_field]) + #try: + #self.osModule.unlink(os.path.join(namePath, crash_id)) + #seenCount += 1 + #except: + #pass + files_list = [x for x in os.listdir(namePath) + if x.startswith(crash_id)] + for a_file_name in files_list: + try: + self.osModule.unlink(os.path.join(namePath, a_file_name)) + seenCount += 1 + except OSError: + self.logger.warning("%s wasn't found", a_file_name) + try: + self.osModule.unlink( + os.path.join(namePath, crash_id + self.jsonSuffix) + ) + seenCount += 1 + except OSError: + pass + if self.cleanIndexDirectories: + try: + socorro_fs.cleanEmptySubdirectories( + stopper, + namePath, + self.osModule + ) # clean out name side if possible + except OSError: + pass + # and the date directory + if datePath: + try: + self.osModule.unlink(os.path.join(datePath, crash_id)) + seenCount += 1 + except Exception: + pass + try: + socorro_fs.cleanEmptySubdirectories( + self.root, + datePath, + self.osModule + ) + except Exception: + pass + if not seenCount: + self.logger.warning("%s was totally unknown", crash_id) + raise NoSuchUuidFound("no trace of %s was found" % crash_id) + + #-------------------------------------------------------------------------- + def quickDelete(self, crash_id): + """deletes just the json and dump files without testing for the links. + This is only to be used after destructiveDateWalk that will have + already removed the symbolic links. """ + namePath, nameParts = self.lookupNamePath(crash_id) + for filename in self.osModule.listdir(namePath): + if filename.startswith(crash_id): + full_filename = os.path.join(namePath, filename) + try: + self.osModule.unlink(full_filename) + except Exception as x: + self.logger.debug( + 'cannot delete %s', + full_filename, + exc_info=True + ) + + #-------------------------------------------------------------------------- + def dump_file_name(self, crash_id, dump_name): + if dump_name == self.dump_field or dump_name is None: + return crash_id + self.dumpSuffix + else: + return "%s.%s%s" % (crash_id, + dump_name, + self.dumpSuffix) diff --git a/socorro/external/filesystem/processed_dump_storage.py b/socorro/external/filesystem/processed_dump_storage.py new file mode 100644 index 0000000000..139f6204ba --- /dev/null +++ b/socorro/external/filesystem/processed_dump_storage.py @@ -0,0 +1,142 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +import gzip +import logging +import os +import json +import socorro.external.filesystem.dump_storage as socorro_dumpStorage +import socorro.lib.util as socorro_util + + +class ProcessedDumpStorage(socorro_dumpStorage.DumpStorage): + """ + This class, mirrored from JsonDumpStorage in March 2009, implements a + gzipped file system storage scheme for the 'cooked raw dump data' from + stackwalk_minidump. The file format is gzipped json, with default suffix + '.jsonz' + Files are located using a radix structure based on the ooid or uuid of the + data, a (nearly) unique + identifier assigned at time of collection. 
The ooid has three parts: + - The ooid prefix + - A suffix that encodes the date of assignment + - information about the appropriate depth of the radix tree (by default: + 4, but now always 2) + The storage is a file whose name is ooid.suffix, whose path is determined + by the ooid itself + + An additional 'date' branch is saved to facilitate finding files by date. + It holds paths like YYYY/mm/dd/HH/MM_n/ooid where MM is among ['00','05', + ..., '55'], n is a (small) digit and ooid is a symbolic link to the + directory in the name branch holding ooid.jsonz + """ + def __init__(self, root='.', **kwargs): + """ + Set up the basic conditions for storing gmpgz files. Possible kwargs + keys: + - 'indexName': The relative path to the top of the name storage tree + from root parameter. Default 'name' + - //deprecated// rootName: now is indexName + - 'dateName': The relative path to the top of the date storage tree + from root parameter. Default 'date' + - 'fileSuffix': The storage filename suffix. Default '.jsonz' + - 'gzipCompression': The level of compression to use. Default = 9 + - 'minutesPerSlot': The number of minutes in the lowest date directory + Default = 1 + - 'subSlotCount': If other than 1 (default) distribute data evenly + among this many sub timeslots + - 'dirPermissions': sets the permissions for all directories in name + and date paths. Default 'rwxrwx---' + - 'dumpPermissions': sets the permissions for actual stored files + (this class creates no files) + - 'dumpGID': sets the group ID for all directoies in name and date + paths. default None. + - 'logger': A logger. Default: logging.getLogger('dumpStorage') + - 'storageDepth': the length of branches in the radix storage tree. + Default = 2 + Do NOT change from 2 without updateing apache mod-rewrite rules + and IT old-file removal scripts + """ + kwargs.setdefault('minutesPerSlot', 1) + kwargs.setdefault('subSlotCount', 1) + rootName = kwargs.get('rootName', 'name') + kwargs.setdefault('indexName', rootName) + super(ProcessedDumpStorage, self).__init__(root=root, **kwargs) + self.fileSuffix = kwargs.get('fileSuffix', '.jsonz') + self.gzipCompression = int(kwargs.get('gzipCompression', 9)) + self.storageDepth = int(kwargs.get('storageDepth', 2)) + if not self.fileSuffix.startswith('.'): + self.fileSuffix = ".%s" % (self.fileSuffix) + self.logger = kwargs.get('logger', logging.getLogger('dumpStorage')) + + def newEntry(self, ooid, timestamp=None): + """ + Given a ooid, create an empty file and a writeable 'file' handle + (actually GzipFile) to it. Create the symbolic link from the date + branch to the file's storage directory + Returns the 'file' handle, or None if there was a problem + """ + nameDir, dateDir = \ + super(ProcessedDumpStorage, self).newEntry(ooid, timestamp) + dname = os.path.join(nameDir, ooid + self.fileSuffix) + df = None + try: + try: + df = gzip.open(dname, 'w', self.gzipCompression) + except IOError, x: + if 2 == x.errno: + # We might have lost this directory during a cleanup in + # another thread or process. Do again. 
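+                    # errno 2 (ENOENT): the name directory disappeared, so
+                    # rebuild it and retry the gzip.open exactly once.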
+ nameDir, nparts = self.makeNameDir(ooid, timestamp) + df = gzip.open(dname, 'w', self.gzipCompression) + else: + raise x + except Exception, x: + raise + os.chmod(dname, self.dumpPermissions) + finally: + if not df: + os.unlink(os.path.join(dateDir, ooid)) + return df + + def putDumpToFile(self, ooid, dumpObject, timestamp=None): + """Given a ooid and an dumpObject, create the appropriate dump file and + fill it with object's data""" + fh = self.newEntry(ooid, timestamp) + try: + json.dump(dumpObject, fh) + finally: + fh.close() + + def getDumpFromFile(self, ooid): + """Given a ooid, extract and return a dumpObject from the associated + file if possible. + raises OSError if the file is missing or unreadable + """ + df = None + try: + df = gzip.open(self.getDumpPath(ooid)) + return json.load(df, object_hook=socorro_util.DotDict) + finally: + if df: + df.close() + + def getDumpPath(self, ooid): + """ Return an absolute path for the file for a given ooid + Raise: OSError if the file is missing or unreadable""" + path = os.path.join(self.namePath(ooid)[0], ooid + self.fileSuffix) + self.readableOrThrow(path) + return path + + def removeDumpFile(self, ooid): + """ + Find and remove the dump file for the given ooid. + Quietly continue if unfound. Log problem and continue if irremovable. + """ + try: + filePath = self.getDumpPath(ooid) + os.unlink(filePath) + except OSError, x: + if 2 != x.errno: + socorro_util.reportExceptionAndContinue(self.logger) diff --git a/socorro/external/hbase/__init__.py b/socorro/external/hbase/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/socorro/external/hbase/connection_context.py b/socorro/external/hbase/connection_context.py new file mode 100644 index 0000000000..46e2c7c317 --- /dev/null +++ b/socorro/external/hbase/connection_context.py @@ -0,0 +1,220 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +import threading +import contextlib + +from socorro.external.hbase import hbase_client +from configman.config_manager import RequiredConfig +from configman import Namespace + + +class HBaseSingleConnectionContext(RequiredConfig): + """a configman compliant class for setup of HBase connections + DO NOT SHARE HBASE CONNECTIONS BETWEEN THREADS + """ + #-------------------------------------------------------------------------- + # configman parameter definition section + # here we're setting up the minimal parameters required for connecting + required_config = Namespace() + required_config.add_option( + 'number_of_retries', + doc='Max. 
number of retries when fetching from hbaseClient', + default=0, + reference_value_from='resource.hbase' + ) + required_config.add_option( + 'hbase_host', + doc='Host to HBase server', + default='localhost', + reference_value_from='resource.hbase', + ) + required_config.add_option( + 'hbase_port', + doc='Port to HBase server', + default=9090, + reference_value_from='resource.hbase', + ) + required_config.add_option( + 'hbase_timeout', + doc='timeout in milliseconds for an HBase connection', + default=5000, + reference_value_from='resource.hbase', + ) + required_config.add_option( + 'temporary_file_system_storage_path', + doc='a local filesystem path where dumps temporarily ' + 'during processing', + default='/home/socorro/temp', + reference_value_from='resource.hbase', + ) + required_config.add_option( + 'dump_file_suffix', + doc='the suffix used to identify a dump file (for use in temp files)', + default='.dump', + reference_value_from='resource.hbase', + ) + + #-------------------------------------------------------------------------- + def __init__(self, config, local_config=None): + """Initialize the parts needed to start making database connections + + parameters: + config - the complete config for the app. If a real app, this + would be where a logger or other resources could be + found. + local_config - this is the namespace within the complete config + where the actual database parameters are found""" + super(HBaseSingleConnectionContext, self).__init__() + self.config = config + if local_config is None: + local_config = config + + dummy_connection = hbase_client.HBaseConnectionForCrashReports( + local_config.hbase_host, + local_config.hbase_port, + local_config.hbase_timeout, + logger=self.config.logger + ) + dummy_connection.close() + self.operational_exceptions = \ + dummy_connection.hbaseThriftExceptions + self.operational_exceptions += \ + (hbase_client.NoConnectionException,) + self.conditional_exceptions = () + + #-------------------------------------------------------------------------- + def connection(self, name_unused=None): + """return a new database connection + + parameters: + name_unused - optional named connections. Used by the + derived class + """ + #self.config.logger.debug('creating new HBase connection') + return hbase_client.HBaseConnectionForCrashReports( + self.config.hbase_host, + self.config.hbase_port, + self.config.hbase_timeout, + logger=self.config.logger + ) + + #-------------------------------------------------------------------------- + @contextlib.contextmanager + def __call__(self, name=None): + """returns a database connection wrapped in a contextmanager. + + The context manager will assure that the connection is closed but will + not try to commit or rollback lingering transactions. + + parameters: + name - an optional name for the database connection""" + conn = self.connection(name) + try: + #self.config.logger.debug('connection HBase acquired') + yield conn + finally: + self.close_connection(conn) + + #-------------------------------------------------------------------------- + def close_connection(self, connection, force=False): + """close the connection passed in. + + This function exists to allow derived classes to override the closing + behavior. 
+ + parameters: + connection - the database connection object + force - unused boolean to force closure; used in derived classes + """ + #self.config.logger.debug('connection HBase closed') + connection.close() + + #-------------------------------------------------------------------------- + def close(self): + """close any pooled or cached connections. Since this base class + object does no caching, there is no implementation required. Derived + classes may implement it.""" + pass + + #-------------------------------------------------------------------------- + def is_operational_exception(self, msg): + """return True if a conditional exception is actually an operational + error. Return False if it's a genuine error that should probably be + raised and propagate up. + + Some conditional exceptions might be actually be some form of + operational exception "labelled" wrong by the psycopg2 code error + handler. + """ + + return False + + #-------------------------------------------------------------------------- + def force_reconnect(self): + pass + + +#============================================================================== +class HBaseConnectionContextPooled(HBaseSingleConnectionContext): + """a configman compliant class that pools HBase database connections""" + #-------------------------------------------------------------------------- + def __init__(self, config, local_config=None): + super(HBaseConnectionContextPooled, self).__init__(config, + local_config) + #self.config.logger.debug("HBaseConnectionContextPooled - " + # "setting up connection pool") + self.pool = {} + + #-------------------------------------------------------------------------- + def connection(self, name=None): + """return a named connection. + + This function will return a named connection by either finding one + in its pool by the name or creating a new one. If no name is given, + it will use the name of the current executing thread as the name of + the connection. + + parameters: + name - a name as a string + """ + if not name: + name = self.config.executor_identity() + if name in self.pool: + #self.config.logger.debug('connection: %s', name) + return self.pool[name] + self.pool[name] = \ + super(HBaseConnectionContextPooled, self).connection(name) + return self.pool[name] + + #-------------------------------------------------------------------------- + def close_connection(self, connection, force=False): + """overriding the baseclass function, this routine will decline to + close a connection at the end of a transaction context. 
This allows + for reuse of connections.""" + if force: + try: + (super(HBaseConnectionContextPooled, self) + .close_connection(connection, force)) + except self.operational_exceptions: + self.config.logger.error('HBaseConnectionContextPooled - ' + 'failed closing') + for name, conn in self.pool.iteritems(): + if conn is connection: + break + del self.pool[name] + + #-------------------------------------------------------------------------- + def close(self): + """close all pooled connections""" + self.config.logger.debug("HBasePooled - " + "shutting down connection pool") + for name, conn in self.pool.iteritems(): + conn.close() + self.config.logger.debug("HBasePooled - connection %s closed" + % name) + + #-------------------------------------------------------------------------- + def force_reconnect(self): + pass diff --git a/socorro/external/hbase/crash_data.py b/socorro/external/hbase/crash_data.py new file mode 100644 index 0000000000..7335ce40bf --- /dev/null +++ b/socorro/external/hbase/crash_data.py @@ -0,0 +1,13 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +from socorro.external.crash_data_base import CrashDataBase + + +class CrashData(CrashDataBase): + """ + Implement the /crash_data service with HBase. + """ + def get_storage(self): + return self.config.hbase.hbase_class(self.config.hbase) diff --git a/socorro/external/hbase/crashstorage.py b/socorro/external/hbase/crashstorage.py new file mode 100644 index 0000000000..a087ab8840 --- /dev/null +++ b/socorro/external/hbase/crashstorage.py @@ -0,0 +1,177 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
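+# This module layers CrashStorageBase on top of HBase: crashes are written
+# through a pooled thrift connection (hbase_connection_pool_class) and every
+# HBase call is routed through the configured transaction_executor_class,
+# which centralizes connection handling and retry behavior.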
+ +import datetime +import os + +from socorro.external.crashstorage_base import ( + CrashStorageBase, CrashIDNotFound) +from socorro.external.hbase import hbase_client +from socorro.database.transaction_executor import TransactionExecutor +from socorro.external.hbase.connection_context import \ + HBaseConnectionContextPooled +from socorro.lib.util import DotDict +from configman import Namespace, class_converter + + +#============================================================================== +class HBaseCrashStorage(CrashStorageBase): + + required_config = Namespace() + required_config.add_option( + 'transaction_executor_class', + default=TransactionExecutor, + doc='a class that will execute transactions', + from_string_converter=class_converter, + reference_value_from='resource.hbase', + ) + required_config.add_option( + 'hbase_connection_pool_class', + default=HBaseConnectionContextPooled, + doc='the class responsible for pooling and giving out HBase' + 'connections', + reference_value_from='resource.hbase', + ) + + #-------------------------------------------------------------------------- + def __init__(self, config, quit_check_callback=None): + super(HBaseCrashStorage, self).__init__(config, quit_check_callback) + + self.logger.info('connecting to hbase') + self.hbaseConnectionPool = config.hbase_connection_pool_class(config) + + self.transaction_executor = config.transaction_executor_class( + config, + self.hbaseConnectionPool, + self.quit_check + ) + + self.exceptions_eligible_for_retry += \ + self.hbaseConnectionPool.operational_exceptions + + #-------------------------------------------------------------------------- + def close(self): + self.hbaseConnectionPool.close() + + #-------------------------------------------------------------------------- + def save_raw_crash(self, raw_crash, dumps, crash_id): + # the transaction_executor will run the function given as the first + # parameter. To that function, the transaction_executor will pass + # self.hbaseConnection, crash_id, raw_crash, dump, and + # number_of_retries. + # notice that the function is an unbound method. Since + # self.hbaseConnection is passed in as the first parameter, that + # fills in the proper value for the function's 'self' parameter. + # warning: this breaks inheritance if a subclass of + # HBaseConnectionForCrashReports were desired instead. + self.transaction_executor( + hbase_client.HBaseConnectionForCrashReports.put_json_dump, + crash_id, + raw_crash, + dumps, + number_of_retries=self.config.number_of_retries + ) + + self.logger.info('saved - %s', crash_id) + + #-------------------------------------------------------------------------- + def save_processed(self, processed_crash): + self._stringify_dates_in_dict(processed_crash) + self.transaction_executor( + hbase_client.HBaseConnectionForCrashReports.put_processed_json, + processed_crash['uuid'], + processed_crash, + number_of_retries=self.config.number_of_retries + ) + + #-------------------------------------------------------------------------- + def save_raw_and_processed(self, raw_crash, dumps, processed_crash, crash_id): + """ bug 866973 - do not put raw_crash back into HBase again + We are doing this in lieu of a queuing solution that could allow + us to operate an independent crashmover. When the queuing system + is implemented, we could remove this, and have the raw crash + saved by a crashmover that's consuming crash_ids the same way + that the processor consumes them. 
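+        Consequently this method only writes the processed crash; the raw
+        crash and its dumps are assumed to have been stored already via
+        save_raw_crash.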
+ """ + self.save_processed(processed_crash) + + #-------------------------------------------------------------------------- + def get_raw_crash(self, crash_id): + return DotDict(self.transaction_executor( + hbase_client.HBaseConnectionForCrashReports.get_json, + crash_id, + number_of_retries=self.config.number_of_retries + )) + + #-------------------------------------------------------------------------- + def get_raw_dump(self, crash_id, name=None): + return self.transaction_executor( + hbase_client.HBaseConnectionForCrashReports.get_dump, + crash_id, + name, + number_of_retries=self.config.number_of_retries + ) + + #-------------------------------------------------------------------------- + def get_raw_dumps(self, crash_id): + return self.transaction_executor( + hbase_client.HBaseConnectionForCrashReports.get_dumps, + crash_id, + number_of_retries=self.config.number_of_retries + ) + #-------------------------------------------------------------------------- + def get_raw_dumps_as_files(self, crash_id): + """this method fetches a set of dumps from HBase and writes each one + to a temporary file. The pathname for the dump includes the string + 'TEMPORARY' as a signal to the processor that it has the responsibilty + to delete the file when it is done using it.""" + dumps_mapping = self.transaction_executor( + hbase_client.HBaseConnectionForCrashReports.get_dumps, + crash_id, + number_of_retries=self.config.number_of_retries + ) + name_to_pathname_mapping = {} + for name, dump in dumps_mapping.iteritems(): + dump_pathname = os.path.join( + self.config.temporary_file_system_storage_path, + "%s.%s.TEMPORARY%s" % (crash_id, + name, + self.config.dump_file_suffix) + ) + name_to_pathname_mapping[name] = dump_pathname + with open(dump_pathname, 'wb') as f: + f.write(dump) + return name_to_pathname_mapping + + #-------------------------------------------------------------------------- + def get_unredacted_processed(self, crash_id): + try: + return DotDict(self.transaction_executor( + hbase_client.HBaseConnectionForCrashReports.get_processed_json, + crash_id, + number_of_retries=self.config.number_of_retries + )) + except hbase_client.OoidNotFoundException: + # we want a consistent set of exceptions for the API + raise CrashIDNotFound(crash_id) + + #-------------------------------------------------------------------------- + def new_crashes(self): + connection = self.hbaseConnectionPool.connection() + return connection.iterator_for_all_legacy_to_be_processed() + + #-------------------------------------------------------------------------- + @staticmethod + def _stringify_dates_in_dict(a_dict): + for name, value in a_dict.iteritems(): + if isinstance(value, datetime.datetime): + a_dict[name] = ("%4d-%02d-%02d %02d:%02d:%02d.%d" % + (value.year, + value.month, + value.day, + value.hour, + value.minute, + value.second, + value.microsecond) + ) diff --git a/socorro/external/hbase/hbase_client.py b/socorro/external/hbase/hbase_client.py new file mode 100644 index 0000000000..a1a4ca84b3 --- /dev/null +++ b/socorro/external/hbase/hbase_client.py @@ -0,0 +1,1315 @@ +#!/usr/bin/python +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +import json +import itertools +import os +import gzip +import heapq +import time +import tarfile +import random +import sys +import contextlib + +import socket + +from thrift import Thrift # get module +from thrift.transport import TSocket, TTransport # get modules +from thrift.protocol import TBinaryProtocol # get module +from hbase import ttypes # get module +from hbase.Hbase import Client, ColumnDescriptor, Mutation # get classes + +import socorro.lib.util as utl +from socorro.lib.util import DotDict + + +class HBaseClientException(Exception): + pass + + +class BadOoidException(HBaseClientException): + def __init__(self, wrapped_exception_class, reason=''): + HBaseClientException.__init__( + self, + "Bad OOID: %s-%s" % (str(wrapped_exception_class), str(reason)) + ) + + +class OoidNotFoundException(HBaseClientException): + def __init__(self, reason=''): + HBaseClientException.__init__(self, "OOID not found: %s" % str(reason)) + + +class FatalException(HBaseClientException): + def __init__(self, wrapped_exception_class, reason=''): + HBaseClientException.__init__( + self, + "the connection is not viable. retries fail: %s" % str(reason) + ) + + +class NoConnectionException(FatalException): + def __init__(self, wrapped_exception_class, reason='', tries=0): + FatalException.__init__( + self, + NoConnectionException, + "No connection was made to HBase (%d tries): %s-%s" % ( + tries, + str(wrapped_exception_class), + str(reason) + ) + ) + + +class UnhandledInternalException(HBaseClientException): + def __init__(self, wrapped_exception_class, reason=''): + HBaseClientException.__init__( + self, + "An internal exception was not handled: %s-%s" % ( + str(wrapped_exception_class), + str(reason) + ) + ) + + +def exception_wrapper(xClass): + """This decorator ensures that no exception escapes that isn't from the + HBaseClientException hierarchy. Any unexpected exceptions are wrapped in + the exception class passed into this function. The original exception is + preserved as the text of the wrapping expression. Traceback info of the + original exception is also preserved as the traceback for the wrapping + exception.""" + def wrapper(fn): + def f(*args, **kwargs): + try: + result = fn(*args, **kwargs) + return result + except HBaseClientException: + raise + except Exception: + txClass, tx, txtb = sys.exc_info() + raise xClass, xClass(txClass, tx), txtb + f.__name__ = fn.__name__ + return f + return wrapper + + +def exception_wrapper_for_generators(xClass): + """This decorator ensures that no exception escapes that isn't from the + HBaseClientException hierarchy. Any unexpected exceptions are wrapped in + the exception class passed into this function. The original exception is + preserved as the text of the wrapping expression. Traceback info of the + original exception is also preserved as the traceback for the wrapping + exception.""" + def wrapper(fn): + def f(*args, **kwargs): + try: + for x in fn(*args, **kwargs): + yield x + except HBaseClientException, x: + raise + except Exception, x: + txClass, tx, txtb = sys.exc_info() + raise xClass, xClass(txClass, tx), txtb + f.__name__ = fn.__name__ + return f + return wrapper + + +def retry_wrapper_for_generators(fn): + """a decorator to add retry symantics to any generator that uses hbase. + Don't wrap iterators that themselves wrap iterators. 
In other words, don't + nest these.""" + def f(self, *args, **kwargs): + self.logger.debug( + 'retry_wrapper_for_generators: trying first time, %s', + fn.__name__ + ) + fail_counter = 0 + while True: # we have to loop forever, we don't know the length of the + # wrapped iterator + try: + for x in fn(self, *args, **kwargs): + fail_counter = 0 + yield x + self.logger.debug( + 'retry_wrapper_for_generators: completed without ' + 'trouble, %s', + fn.__name__ + ) + break # this is the sucessful exit from this function + except self.hbaseThriftExceptions, x: + self.logger.debug( + 'retry_wrapper_for_generators: handled exception %s', + str(x) + ) + fail_counter += 1 + if fail_counter > 1: + self.logger.error( + 'retry_wrapper_for_generators: failed too many ' + 'times on this one operation, %s', + fn.__name__ + ) + txClass, tx, txtb = sys.exc_info() + raise FatalException, FatalException(tx), txtb + try: + self.close() + except self.hbaseThriftExceptions: + pass + self.logger.debug( + 'retry_wrapper_for_generators: about to retry connection' + ) + self.make_connection(timeout=self.timeout) + self.logger.debug( + 'retry_wrapper_for_generators: about to retry function, %s', + fn.__name__ + ) + except Exception, x: + self.logger.debug( + 'retry_wrapper_for_generators: unhandled exception, %s', + str(x) + ) + raise + f.__name__ = fn.__name__ + return f + + +def optional_retry_wrapper(fn): + """a decorator to add retry symantics to any method that uses hbase""" + def f(self, *args, **kwargs): + number_of_retries = kwargs.setdefault('number_of_retries', 1) + del kwargs['number_of_retries'] + wait_between_retries = kwargs.setdefault('wait_between_retries', 6) + del kwargs['wait_between_retries'] + countdown = number_of_retries + 1 + while countdown: + countdown -= 1 + try: + result = fn(self, *args, **kwargs) + return result + # drop and remake connection + except self.hbaseThriftExceptions, x: + self.logger.debug( + 'hbase client retry_wrapper: handled exception, %s', + str(x) + ) + if not countdown: + # we've gone through all the retries that we're allowed + txClass, tx, txtb = sys.exc_info() + raise FatalException, FatalException(tx), txtb + try: + self.close() + except self.hbaseThriftExceptions: + pass + self.logger.debug('hbase client retry_wrapper: about to retry connection') + self.make_connection(timeout=self.timeout) + # unknown error - abort + except Exception, x: + self.logger.debug( + 'hbase client retry_wrapper: unhandled exception, %s', + str(x) + ) + raise + if wait_between_retries: + time.sleep(wait_between_retries) + f.__name__ = fn.__name__ + return f + + +@exception_wrapper(BadOoidException) +def guid_to_timestamped_row_id(id, timestamp): + """ + Returns a row_id suitable for the HBase crash_reports index tables. + The first hex character of the ooid is used to "salt" the rowkey + so that there should always be 16 HBase RegionServers responsible + for dealing with the current stream of data. + Then, we put the crash_report submission timestamp. This lets us + easily scan through a time specific region of the index. + Finally, we append the normal ooid string for uniqueness. + """ + if timestamp[-6] in "-+": + return "%s%s%s" % (id[0], timestamp[:-6], id) + return "%s%s%s" % (id[0], timestamp, id) + + +@exception_wrapper(BadOoidException) +def ooid_to_row_id(ooid, old_format=False): + """ + Returns a row_id suitable for the HBase crash_reports table. 
+ The first hex character of the ooid is used to "salt" the rowkey + so that there should always be 16 HBase RegionServers responsible + for dealing with the current stream of data. + Then, we put the last six digits of the ooid which represent the + submission date. This lets us easily scan through the crash_reports + table by day. + Finally, we append the normal ooid string. + """ + try: + if old_format: + return "%s%s" % (ooid[-6:], ooid) + else: + return "%s%s%s" % (ooid[0], ooid[-6:], ooid) + except Exception, x: + raise BadOoidException(x) + + +@exception_wrapper(BadOoidException) +def row_id_to_ooid(row_id): + """ + Returns the natural ooid given an HBase row key. + See ooid_to_row_id for structure of row_id. + """ + try: + return row_id[7:] + except Exception, x: + raise BadOoidException(x) + + +class HBaseConnection(object): + """ + Base class for hbase connections. Supplies methods for a few basic + queries and methods for cleanup of thrift results. + """ + def __init__(self, host, port, timeout, + thrift=Thrift, + tsocket=TSocket, + ttrans=TTransport, + protocol=TBinaryProtocol, + ttp=ttypes, + client=Client, + column=ColumnDescriptor, + mutation=Mutation, + logger=utl.SilentFakeLogger()): + self.host = host + self.port = port + self.timeout = timeout + self.thriftModule = thrift + self.tsocketModule = tsocket + self.transportModule = ttrans + self.protocolModule = protocol + self.ttypesModule = ttp + self.clientClass = client + self.columnClass = column + self.mutationClass = mutation + self.logger = logger + self.hbaseThriftExceptions = (self.ttypesModule.IOError, + #self.ttypesModule.IllegalArgument, + #self.ttypesModule.AlreadyExists, + self.thriftModule.TException, + #HBaseClientException, + socket.timeout, + socket.error, + FatalException + ) + self.operational_exceptions = self.hbaseThriftExceptions + self.conditional_exceptions = () + + try: + self.make_connection(timeout=self.timeout) + except NoConnectionException: + utl.reportExceptionAndContinue(logger=self.logger) + + def make_connection(self, retry=2, timeout=9000): + """Establishes the underlying connection to hbase""" + try: + self.logger.debug('make_connection, timeout = %d', timeout) + except Exception: + pass + count = retry + while count: + try: + count -= 1 + # Make socket + transport = self.tsocketModule.TSocket(self.host, self.port) + transport.setTimeout(timeout) # in ms + # Buffering is critical. Raw sockets are very slow + self.transport = \ + self.transportModule.TBufferedTransport(transport) + # Wrap in a protocol + self.protocol = \ + self.protocolModule.TBinaryProtocol(self.transport) + # Create a client to use the protocol encoder + self.client = self.clientClass(self.protocol) + # Connect! 
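+                # open() raises on failure; the surrounding while loop retries
+                # up to 'retry' times and then raises NoConnectionException.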
+ self.transport.open() + self.badConnection = False + self.logger.debug('connection successful') + return + except self.hbaseThriftExceptions, x: + self.logger.debug('connection fails: %s', str(x)) + self.badConnection = True + pass + exceptionType, exception, tracebackInfo = sys.exc_info() + raise NoConnectionException, \ + NoConnectionException(exceptionType, exception, retry), \ + tracebackInfo + + def close(self): + """Close the hbase connection""" + self.transport.close() + + def _make_rows_nice(self, client_result_object): + """Apply _make_row_nice to multiple rows""" + res = [self._make_row_nice(row) for row in client_result_object] + return res + + def _make_row_nice(self, client_row_object): + """Pull out the contents of the thrift column result objects into a + python dict""" + return dict( + ((x, y.value) for x, y in client_row_object.columns.items()) + ) + + @optional_retry_wrapper + def describe_table(self, table_name): + return self.client.getColumnDescriptors(table_name) + + @optional_retry_wrapper + def get_full_row(self, table_name, row_id): + """Get back every column value for a specific row_id""" + return self._make_rows_nice(self.client.getRow(table_name, row_id)) + + def commit(self): + pass + + def rollback(self): + pass + + def is_operational_exception(self, msg): + return True + + +class HBaseConnectionForCrashReports(HBaseConnection): + """A subclass of the HBaseConnection class providing more crash report + specific methods""" + def __init__(self, + host, + port, + timeout, + thrift=Thrift, + tsocket=TSocket, + ttrans=TTransport, + protocol=TBinaryProtocol, + ttp=ttypes, + client=Client, + column=ColumnDescriptor, + mutation=Mutation, + logger=utl.SilentFakeLogger()): + super(HBaseConnectionForCrashReports, self).__init__( + host, + port, + timeout, + thrift, + tsocket, + ttrans, + protocol, + ttp, + client, + column, + mutation, + logger + ) + + def _make_row_nice(self, client_row_object): + """This method allows the CrashReports subclass to output an additional + column called ooid which does not have the HBase row_key prefixing junk + in the way.""" + columns = super(HBaseConnectionForCrashReports, self)._make_row_nice( + client_row_object + ) + columns['_rowkey'] = client_row_object.row + return columns + + @optional_retry_wrapper + def get_json_meta_as_string(self, ooid, old_format=False): + """ + Return the json metadata for a given ooid as an unexpanded string. + If the ooid doesn't exist, raise not found. 
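+        The lookup reads the 'meta_data:json' column of the 'crash_reports'
+        table using the salted row key produced by ooid_to_row_id.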
+ """ + row_id = ooid_to_row_id(ooid, old_format) + listOfRawRows = self.client.getRowWithColumns( + 'crash_reports', + row_id, + ['meta_data:json'] + ) + try: + if listOfRawRows: + return listOfRawRows[0].columns["meta_data:json"].value + else: + raise OoidNotFoundException("%s - %s" % (ooid, row_id)) + except KeyError: + self.logger.debug( + 'key error trying to get "meta_data:json" for %s', + ooid + ) + raise + + #@optional_retry_wrapper + def get_json(self, ooid, old_format=False, number_of_retries=2): + """Return the json metadata for a given ooid as an json data object""" + jsonColumnOfRow = self.get_json_meta_as_string( + ooid, + old_format, + number_of_retries=number_of_retries + ) + json_data = json.loads(jsonColumnOfRow, object_hook=DotDict) + return json_data + + @optional_retry_wrapper + def get_dump(self, ooid, name=None): + """Return the minidump for a given ooid as a string of bytes + If the ooid doesn't exist, raise not found""" + if name in (None, '', 'upload_file_minidump'): + name = 'dump' + column_family_and_qualifier = 'raw_data:%s' % name + row_id = ooid_to_row_id(ooid) + listOfRawRows = self.client.getRowWithColumns( + 'crash_reports', + row_id, + [column_family_and_qualifier] + ) + try: + if listOfRawRows: + return listOfRawRows[0].columns[column_family_and_qualifier].value + else: + raise OoidNotFoundException(ooid) + except KeyError: + self.logger.debug( + 'key error trying to get "%s" for %s', + (column_family_and_qualifier, ooid) + ) + raise + + @staticmethod + def _make_dump_name(family_qualifier): + name = family_qualifier.split(':')[1] + if name == 'dump': + name = 'upload_file_minidump' + return name + + @optional_retry_wrapper + def get_dumps(self, ooid): + """Return the minidump for a given ooid as a string of bytes + If the ooid doesn't exist, raise not found""" + row_id = ooid_to_row_id(ooid) + listOfRawRows = self.client.getRowWithColumns( + 'crash_reports', + row_id, + ['raw_data'] + ) + try: + if listOfRawRows: + column_mapping = listOfRawRows[0].columns + d = dict([ + (self._make_dump_name(k), v.value) + for k, v in column_mapping.iteritems()]) + return d + else: + raise OoidNotFoundException(ooid) + except KeyError: + self.logger.debug( + 'key error trying to get "raw_data" from %s', + ooid + ) + raise + + @optional_retry_wrapper + def get_raw_report(self, ooid): + """Return the json and dump for a given ooid + If the ooid doesn't exist, raise not found""" + row_id = ooid_to_row_id(ooid) + listOfRawRows = self.client.getRowWithColumns( + 'crash_reports', + row_id, + ['meta_data:json', 'raw_data:dump'] + ) + if listOfRawRows: + return self._make_row_nice(listOfRawRows[0]) + else: + raise OoidNotFoundException(ooid) + + @optional_retry_wrapper + def get_processed_json_as_string(self, ooid): + """Return the cooked json (jsonz) for a given ooid as a string + If the ooid doesn't exist, return an empty string.""" + row_id = ooid_to_row_id(ooid) + listOfRawRows = self.client.getRowWithColumns( + 'crash_reports', + row_id, + ['processed_data:json'] + ) + if listOfRawRows: + return listOfRawRows[0].columns["processed_data:json"].value + else: + raise OoidNotFoundException(ooid) + + #@optional_retry_wrapper + def get_processed_json(self, ooid, number_of_retries=2): + """Return the cooked json (jsonz) for a given ooid as a json object + If the ooid doesn't exist, return an empty string.""" + jsonColumnOfRow = self.get_processed_json_as_string( + ooid, + number_of_retries=number_of_retries + ) + json_data = json.loads(jsonColumnOfRow, object_hook=DotDict) + 
return json_data + + @optional_retry_wrapper + def get_report_processing_state(self, ooid): + """Return the current state of processing for this report and the + submitted_timestamp needed. For processing queue manipulation. + If the ooid doesn't exist, return an empty array""" + row_id = ooid_to_row_id(ooid) + listOfRawRows = self.client.getRowWithColumns( + 'crash_reports', + row_id, + ['flags:processed', + 'flags:legacy_processing', + 'timestamps:submitted', + 'timestamps:processed' + ] + ) + if listOfRawRows: + return self._make_row_nice(listOfRawRows[0]) + else: + raise OoidNotFoundException(ooid) + + @optional_retry_wrapper + def get_list_of_processed_json_for_date(self, date): + """Iterates through all rows for a given date and returns the list of + all crash ids. The implementation opens up 16 scanners (one for each + leading hex character of the salt) one at a time and returns all of + the rows matching""" + for row in self.union_scan_with_prefix( + 'crash_reports', + date, + ['processed_data:json'] + ): + yield row['processed_data:json'] + + def export_jsonz_for_date(self, date, path): + """Iterates through all rows for a given date and dumps the + processed_data:json out as a jsonz file. The implementation opens up + 16 scanners (one for each leading hex character of the salt) + one at a time and returns all of the rows matching""" + for row in self.limited_iteration( + self.union_scan_with_prefix( + 'crash_reports', + date, + ['processed_data:json'] + ), + 10 + ): + ooid = row_id_to_ooid(row['_rowkey']) + if row['processed_data:json']: + file_name = os.path.join(path, ooid + '.jsonz') + file_handle = gzip.open(file_name, 'w', 9) + try: + json.dump(row['processed_data:json'], file_handle) + finally: + file_handle.close() + + def export_jsonz_tarball_for_date(self, date, path, tarball_name): + """ + Iterates through all rows for a given date and dumps the + processed_data:json out as a jsonz file. 
The implementation opens up + 16 scanners (one for each leading hex character of the salt) + one at a time and returns all of the rows matching + """ + tf = tarfile.open(tarball_name, 'w:gz') + try: + for i, row in enumerate( + self.limited_iteration( + self.union_scan_with_prefix( + 'crash_reports', + date, + ['processed_data:json'] + ), + 10) + ): + ooid = row_id_to_ooid(row['_rowkey']) + if row['processed_data:json']: + file_name = os.path.join(path, ooid + '.jsonz') + file_handle = gzip.open(file_name, 'w', 9) + try: + json.dump(row['processed_data:json'], file_handle) + finally: + file_handle.close() + tf.add( + file_name, + os.path.join(ooid[:2], ooid[2:4], ooid + '.jsonz') + ) + os.unlink(file_name) + finally: + tf.close() + + def export_jsonz_tarball_for_ooids(self, path, tarball_name): + """Creates jsonz files for each ooid passed in on stdin and puts them + all in a tarball""" + tf = tarfile.open(tarball_name, 'w') + try: + for line in sys.stdin.readlines(): + ooid = line.strip() + self.logger.debug('Ooid: "%s"', ooid) + if len(ooid) == 36: + try: + json = self.get_processed_json_as_string(ooid) + except OoidNotFoundException: + self.logger.debug( + 'OoidNotFound (No processed_data:json?): %s', + ooid + ) + continue + file_name = os.path.join(path, ooid + '.jsonz') + file_handle = gzip.open(file_name, 'w', 9) + try: + file_handle.write(json) + finally: + file_handle.close() + tf.add(file_name, os.path.join( + ooid[:2], + ooid[2:4], + ooid + '.jsonz' + )) + os.unlink(file_name) + else: + self.logger.debug('Skipping...') + finally: + tf.close() + + def union_scan_with_prefix(self, table, prefix, columns): + # TODO: Need assertion for columns contains at least 1 element + """A lazy chain of iterators that yields unordered rows starting with + a given prefix. The implementation opens up 16 scanners (one for each + leading hex character of the salt) one at a time and returns all of + the rows matching""" + for salt in '0123456789abcdef': + salted_prefix = "%s%s" % (salt, prefix) + scanner = self.client.scannerOpenWithPrefix( + table, + salted_prefix, + columns + ) + for rowkey, row in salted_scanner_iterable( + self.logger, + self.client, + self._make_row_nice, + salted_prefix, + scanner + ): + yield row + + def merge_scan_with_prefix(self, table, prefix, columns): + # TODO: Need assertion that columns is array containing at least + # one string + """A generator based iterator that yields totally ordered rows starting + with a given prefix. The implementation opens up 16 scanners (one for + each leading hex character of the salt) simultaneously and then yields + the next row in order from the pool on each iteration.""" + iterators = [] + next_items_queue = [] + for salt in '0123456789abcdef': + salted_prefix = "%s%s" % (salt, prefix) + scanner = self.client.scannerOpenWithPrefix( + table, + salted_prefix, + columns + ) + iterators.append(salted_scanner_iterable( + self.logger, + self.client, + self._make_row_nice, + salted_prefix, + scanner + )) + # The i below is so we can advance whichever scanner delivers us the + # polled item. + for i, it in enumerate(iterators): + try: + next = it.next + next_items_queue.append([next(), i, next]) + except StopIteration: + pass + heapq.heapify(next_items_queue) + + while 1: + try: + while 1: + row_tuple, iter_index, next = s = next_items_queue[0] + # tuple[1] is the actual nice row. 
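+                    # The heap holds one head item per salted scanner; yield
+                    # the smallest, advance that scanner, then heapreplace to
+                    # keep the merge totally ordered.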
+ yield row_tuple[1] + s[0] = next() + heapq.heapreplace(next_items_queue, s) + except StopIteration: + heapq.heappop(next_items_queue) + except IndexError: + return + + def limited_iteration(self, iterable, limit=10 ** 6): + self.logger.debug('limit = %d' % limit) + return itertools.islice(iterable, limit) + + @retry_wrapper_for_generators + def iterator_for_all_legacy_to_be_processed(self): + self.logger.debug('iterator_for_all_legacy_to_be_processed') + for row in self.limited_iteration( + self.merge_scan_with_prefix( + 'crash_reports_index_legacy_unprocessed_flag', + '', + ['ids:ooid'] + ) + ): + self.delete_from_legacy_processing_index(row['_rowkey']) + yield row['ids:ooid'] + + #@retry_wrapper_for_generators + @optional_retry_wrapper + def acknowledge_ooid_as_legacy_priority_job(self, ooid): + try: + state = self.get_report_processing_state(ooid) + if state: + row_key = guid_to_timestamped_row_id( + ooid, + state['timestamps:submitted'] + ) + self.delete_from_legacy_processing_index(row_key) + return bool(state) + except OoidNotFoundException: + return False + + @optional_retry_wrapper + def delete_from_legacy_processing_index(self, index_row_key): + self.client.deleteAllRow( + 'crash_reports_index_legacy_unprocessed_flag', + index_row_key + ) + self.client.atomicIncrement( + 'metrics', + 'crash_report_queue', + 'counters:current_legacy_unprocessed_size', + -1 + ) + + @optional_retry_wrapper + def put_crash_report_indices(self, ooid, timestamp, indices): + row_id = guid_to_timestamped_row_id(ooid, timestamp) + for index_name in indices: + self.client.mutateRow( + index_name, + row_id, + [self.mutationClass(column="ids:ooid", value=ooid)] + ) + + @optional_retry_wrapper + def put_crash_report_hang_indices(self, ooid, hang_id, process_type, + timestamp): + ooid_column_name = "ids:ooid:" + process_type + self.client.mutateRow( + 'crash_reports_index_hang_id_submitted_time', + guid_to_timestamped_row_id(hang_id, timestamp), + [self.mutationClass(column=ooid_column_name, value=ooid)] + ) + self.client.mutateRow( + 'crash_reports_index_hang_id', + hang_id, + [self.mutationClass(column=ooid_column_name, value=ooid)] + ) + + @optional_retry_wrapper + def update_metrics_counters_for_submit(self, + submitted_timestamp, + legacy_processing, + process_type, + is_hang, + add_to_unprocessed_queue): + """Increments a series of counters in the 'metrics' table related to + CR submission""" + timeLevels = [ + submitted_timestamp[:16], # minute yyyy-mm-ddTHH:MM + submitted_timestamp[:13], # hour yyyy-mm-ddTHH + submitted_timestamp[:10], # day yyyy-mm-dd + submitted_timestamp[: 7], # month yyyy-mm + submitted_timestamp[: 4] # year yyyy + ] + counterIncrementList = ['counters:submitted_crash_reports'] + counterIncrementList.append( + "counters:submitted_crash_reports_legacy_throttle_%d" + % legacy_processing + ) + if process_type != 'default': + if is_hang: + counterIncrementList.append( + "counters:submitted_crash_report_hang_pairs" + ) + else: + counterIncrementList.append( + "counters:submitted_oop_%s_crash_reports" % process_type + ) + + if add_to_unprocessed_queue: + self.client.atomicIncrement( + 'metrics', + 'crash_report_queue', + 'counters:current_unprocessed_size', + 1 + ) + if legacy_processing == 0: + self.client.atomicIncrement( + 'metrics', + 'crash_report_queue', + 'counters:current_legacy_unprocessed_size', + 1 + ) + + for rowkey in timeLevels: + for column in counterIncrementList: + self.client.atomicIncrement('metrics', rowkey, column, 1) + + @optional_retry_wrapper + def 
put_json_dump(self, ooid, json_data, dumps, + add_to_unprocessed_queue=True): + """Create a crash report record in hbase from serialized json and + bytes of the minidump""" + row_id = ooid_to_row_id(ooid) + submitted_timestamp = json_data['submitted_timestamp'] + json_string = json.dumps(json_data) + + # Extract ACCEPT(0), DEFER(1), DISCARD(2) enum or 0 if not found. + legacy_processing = json_data.get('legacy_processing', 0) + + columns = [ + ("flags:processed", "N"), + ("meta_data:json", json_string), + ("timestamps:submitted", submitted_timestamp), + ("ids:ooid", ooid), + #("raw_data:dump", dump) + ] + for key, dump in dumps.iteritems(): + if key in (None, '', 'upload_file_minidump'): + key = 'dump' + columns.append(('raw_data:%s' % key, dump)) + mutationList = [ + self.mutationClass(column=c, value=v) + for c, v in columns if v is not None + ] + + indices = [ + 'crash_reports_index_submitted_time', + 'crash_reports_index_unprocessed_flag' + ] + + if legacy_processing == 0: + mutationList.append( + self.mutationClass(column="flags:legacy_processing", value='Y') + ) + indices.append('crash_reports_index_legacy_unprocessed_flag') + indices.append('crash_reports_index_legacy_submitted_time') + + # Use ProcessType value if exists, otherwise, default (i.e. a + # standard application crash report) + process_type = json_data.get('ProcessType', 'default') + + is_hang = 'HangID' in json_data + if is_hang: + hang_id = json_data['HangID'] + mutationList.append( + self.mutationClass(column="ids:hang", value=hang_id) + ) + + # unit test marker 233 + self.client.mutateRow('crash_reports', row_id, mutationList) + self.put_crash_report_indices(ooid, submitted_timestamp, indices) + if is_hang: + self.put_crash_report_hang_indices( + ooid, + hang_id, + process_type, + submitted_timestamp + ) + + self.update_metrics_counters_for_submit( + submitted_timestamp, + legacy_processing, + process_type, + is_hang, + add_to_unprocessed_queue + ) + + def put_json_dump_from_files(self, ooid, json_path, dump_path, + openFn=open): + """Convenience method for creating an ooid from disk""" + json_file = open(json_path, 'r') + try: + json = json_file.read() + finally: + json_file.close() + # Apparently binary mode only matters in windows, but it won't hurt + # anything on unix systems. 
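+        # Note: the local name 'json' shadows the imported json module here,
+        # and put_json_dump now expects a parsed dict plus a dict of dumps, so
+        # this convenience helper appears stale relative to that signature.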
+ dump_file = open(dump_path, 'rb') + try: + dump = dump_file.read() + finally: + dump_file.close() + self.put_json_dump(ooid, json, dump) + + @optional_retry_wrapper + def put_fixed_dump(self, ooid, dump, submitted_timestamp, + add_to_unprocessed_queue=True): + """Update a crash report with a new dump file optionally queuing for + processing""" + row_id = ooid_to_row_id(ooid) + + columns = [ + ("raw_data:dump", dump) + ] + mutationList = [ + self.mutationClass(column=c, value=v) + for c, v in columns if v is not None + ] + + indices = [] + + if add_to_unprocessed_queue: + indices.append('crash_reports_index_legacy_unprocessed_flag') + + self.client.mutateRow('crash_reports', row_id, mutationList) + + self.put_crash_report_indices(ooid, submitted_timestamp, indices) + + @optional_retry_wrapper + def put_processed_json(self, ooid, processed_json): + """ + Create a crash report from the cooked json output of the processor + """ + row_id = ooid_to_row_id(ooid) + + processing_state = self.get_report_processing_state(ooid) + submitted_timestamp = processing_state.get( + 'timestamps:submitted', + processed_json.get('date_processed', 'unknown') + ) + + if 'N' == processing_state.get('flags:processed', '?'): + index_row_key = guid_to_timestamped_row_id( + ooid, + submitted_timestamp + ) + self.client.atomicIncrement( + 'metrics', + 'crash_report_queue', + 'counters:current_unprocessed_size', + -1 + ) + self.client.deleteAllRow( + 'crash_reports_index_unprocessed_flag', + index_row_key + ) + + processed_timestamp = processed_json['completeddatetime'] + + if 'signature' in processed_json: + if len(processed_json['signature']) > 0: + signature = processed_json['signature'] + else: + signature = '##empty##' + else: + signature = '##null##' + + mutationList = [] + mutationList.append( + self.mutationClass( + column="timestamps:processed", + value=processed_timestamp + ) + ) + mutationList.append( + self.mutationClass( + column="processed_data:signature", + value=signature + ) + ) + mutationList.append( + self.mutationClass( + column="processed_data:json", + value=json.dumps(processed_json) + ) + ) + mutationList.append( + self.mutationClass( + column="flags:processed", + value="Y" + ) + ) + + self.client.mutateRow('crash_reports', row_id, mutationList) + + sig_ooid_idx_row_key = signature + ooid + self.client.mutateRow( + 'crash_reports_index_signature_ooid', + sig_ooid_idx_row_key, + [self.mutationClass(column="ids:ooid", value=ooid)] + ) + + def export_sampled_crashes_tarball_for_dates(self, sample_size, dates, + path, tarball_name): + """Iterates through all rows for given dates and dumps json and dump + for N random crashes. 
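+        The 'dates' argument is a comma separated list of dates used as row
+        prefixes; 'sample_size' is the number of crashes, N, to keep.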
The implementation opens up 16 scanners (one for + each leading hex character of the salt) one at a time and returns all + of the rows randomly selected using a resovoir sampling algorithm""" + sample_size = int(sample_size) + dates = str.split(dates, ',') + + def gen(): + """generate all rows for given dates""" + for date in dates: + for id_row in self.union_scan_with_prefix( + 'crash_reports', + date, + ['ids:ooid'] + ): + yield id_row['ids:ooid'] + row_gen = gen() # start the generator + # get inital sample + ooids_to_export = [ + x for i, x in itertools.izip(xrange(sample_size), row_gen) + ] + # cycle through remaining rows + for i, ooid in enumerate(row_gen): + # Randomly replace elements with decreasing probability + rand = random.randrange(i + sample_size) + if rand < sample_size: + ooids_to_export[rand] = ooid + + # open output tar file + tf = tarfile.open(tarball_name, 'w:gz') + try: + for ooid in ooids_to_export: + json_file_name = os.path.join(path, ooid + '.json') + dump_file_name = os.path.join(path, ooid + '.dump') + row = self.get_raw_report(ooid) + json_file_handle = open(json_file_name, 'w') + try: + json_file_handle.write(row['meta_data:json']) + finally: + json_file_handle.close() + dump_file_handle = open(dump_file_name, 'w') + try: + dump_file_handle.write(row['raw_data:dump']) + finally: + dump_file_handle.close() + tf.add(json_file_name, os.path.join( + ooid[:2], + ooid[2:4], + ooid + '.json') + ) + tf.add( + dump_file_name, + os.path.join(ooid[:2], ooid[2:4], ooid + '.dump') + ) + os.unlink(json_file_name) + os.unlink(dump_file_name) + finally: + tf.close() + + +def salted_scanner_iterable(logger, client, make_row_nice, salted_prefix, + scanner): + """Generator based iterable that runs over an HBase scanner + yields a tuple of the un-salted rowkey and the nice format of the row.""" + logger.debug('Scanner %s generated', salted_prefix) + raw_rows = client.scannerGet(scanner) + while raw_rows: + nice_row = make_row_nice(raw_rows[0]) + #logger.debug('Scanner %s returning nice_row (%s) for raw_rows (%s)' % + #(self.salted_prefix,nice_row,raw_rows)) + yield (nice_row['_rowkey'][1:], nice_row) + raw_rows = client.scannerGet(scanner) + logger.debug('Scanner %s exhausted' % salted_prefix) + client.scannerClose(scanner) + +# TODO: Warning, the command line methods haven't been tested for bitrot +if __name__ == "__main__": + import pprint + + def ppjson(data, sort_keys=False, indent=4): + print json.dumps(data, sort_keys, indent) + + def usage(): + print """ + Usage: %s [-h host[:port]] command [arg1 [arg2...]] + + Commands: + Crash Report specific: + get_report ooid + get_json ooid + get_dump ooid + get_processed_json ooid + get_report_processing_state ooid + union_scan_with_prefix table prefix columns [limit] + merge_scan_with_prefix table prefix columns [limit] + put_json_dump ooid json dump + put_json_dump_from_files ooid json_path dump_path + export_jsonz_for_date YYMMDD export_path + export_jsonz_tarball_for_date YYMMDD temp_path tarball_name + export_jsonz_tarball_for_ooids temp_path tarball_name + export_sampled_crashes_tarball_for_dates sample_size + YYMMDD,YYMMDD,... 
path tarball_name + HBase generic: + describe_table table_name + get_full_row table_name row_id + """ % sys.argv[0] + + if len(sys.argv) <= 1 or sys.argv[1] == '--help': + usage() + sys.exit(0) + + pp = pprint.PrettyPrinter(indent=2) + host = 'localhost' + port = 9090 + argi = 1 + + if sys.argv[argi] == '-h': + parts = sys.argv[argi + 1].split(':') + host = parts[0] + if len(parts) == 2: + port = int(parts[1]) + argi += 2 + + cmd = sys.argv[argi] + args = sys.argv[argi + 1:] + + connection = HBaseConnectionForCrashReports( + host, + port, + 5000, + logger=utl.FakeLogger() + ) + + if cmd == 'get_report': + if len(args) != 1: + usage() + sys.exit(1) + pp.pprint(connection.get_report(*args)) + + elif cmd == 'get_json': + if len(args) < 1: + usage() + sys.exit(1) + old = len(args) == 2 + ppjson(connection.get_json(args[0], old)) + + elif cmd == 'get_dump': + if len(args) != 1: + usage() + sys.exit(1) + print(connection.get_dump(*args)) + + elif cmd == 'get_dumps': + if len(args) != 1: + usage() + sys.exit(1) + dumps = connection.get_dumps(*args) + for k, v in dumps.iteritems(): + print "%s: dump length = %s" % (k, len(v)) + + + elif cmd == 'get_processed_json': + if len(args) != 1: + usage() + sys.exit(1) + ppjson(connection.get_processed_json(*args)) + + elif cmd == 'get_report_processing_state': + if len(args) != 1: + usage() + sys.exit(1) + pp.pprint(connection.get_report_processing_state(*args)) + + elif cmd == 'union_scan_with_prefix': + if len(args) < 3: + usage() + sys.exit(1) + columns = args[2].split(',') + if len(args) > 3: + limit = int(args[3]) + else: + limit = 10 + for row in connection.limited_iteration( + connection.union_scan_with_prefix(args[0], args[1], columns), + limit + ): + ppjson(row) + + elif cmd == 'merge_scan_with_prefix': + if len(args) < 3: + usage() + sys.exit(1) + columns = args[2].split(',') + if len(args) > 3: + limit = int(args[3]) + else: + limit = 10 + for row in connection.limited_iteration( + connection.merge_scan_with_prefix(args[0], args[1], columns), + limit + ): + ppjson(row) + + elif cmd == 'put_json_dump': + if len(args) != 3: + usage() + sys.exit(1) + ppjson(connection.put_json_dump(*args)) + + elif cmd == 'put_json_dump_from_files': + if len(args) != 3: + usage() + sys.exit(1) + ppjson(connection.put_json_dump_from_files(*args)) + + elif cmd == 'export_jsonz_for_date': + if len(args) != 2: + usage() + sys.exit(1) + connection.export_jsonz_for_date(*args) + + elif cmd == 'export_jsonz_tarball_for_date': + if len(args) != 3: + usage() + sys.exit(1) + connection.export_jsonz_tarball_for_date(*args) + + elif cmd == 'export_jsonz_tarball_for_ooids': + if len(args) != 2: + usage() + sys.exit(1) + connection.export_jsonz_tarball_for_ooids(*args) + + elif cmd == 'export_sampled_crashes_tarball_for_dates': + if len(args) != 4: + usage() + sys.exit(1) + connection.export_sampled_crashes_tarball_for_dates(*args) + + elif cmd == 'describe_table': + if len(args) != 1: + usage() + sys.exit(1) + pp.pprint(connection.describe_table(*args)) + + elif cmd == 'get_full_row': + if len(args) != 2: + usage() + sys.exit(1) + pp.pprint(connection.get_full_row(*args)) + + else: + usage() + sys.exit(1) + + connection.close() + +# vi: sw=2 ts=2 diff --git a/socorro/external/postgresql/tcbs.py b/socorro/external/postgresql/tcbs.py index 90037f5b10..3571e2a5bd 100644 --- a/socorro/external/postgresql/tcbs.py +++ b/socorro/external/postgresql/tcbs.py @@ -139,6 +139,7 @@ def getListOfTopCrashersBySignature(connection, dbParams): order_by, dbParams["limit"] ) + cursor = 
connection.cursor() params = ( dbParams['product'], dbParams['version'], @@ -146,7 +147,6 @@ def getListOfTopCrashersBySignature(connection, dbParams): dbParams['to_date'], ) try: - cursor = connection.cursor() return db.execute(cursor, sql, params) except Exception: connection.rollback() diff --git a/socorro/lib/ConfigurationManager.py b/socorro/lib/ConfigurationManager.py new file mode 100644 index 0000000000..72a7025b27 --- /dev/null +++ b/socorro/lib/ConfigurationManager.py @@ -0,0 +1,522 @@ +#!/usr/bin/python +# +# Copyright 2004 by Centaur Software Engineering, Inc. +# +# +# This file is part of The CSE Python Library. +# +# The CSE Python Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# The CSE Python Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with The CSE Python Library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# + + +version = "1.3" + +import getopt +import os +import sys +import cStringIO +import datetime + +from socorro.lib.datetimeutil import string_to_datetime + +#============================================================================================ +class ConfigFileMissingError (IOError): + pass +ConfigurationManagerConfigFileMissing = ConfigFileMissingError # for legacy compatability +#============================================================================================ +class ConfigFileOptionNameMissingError (Exception): + pass +ConfigurationManagerConfigFileOptionNameMissing = ConfigFileOptionNameMissingError # for legacy compatability +#============================================================================================ +class NotAnOptionError (Exception): + pass +ConfigurationManagerNotAnOption = NotAnOptionError # for legacy compatability + +#============================================================================================ +class OptionError (Exception): + def __init__(self, errorString): + super(OptionError, self).__init__(errorString) + +#============================================================================================ +class CannotConvert (ValueError): + pass + +#============================================================================================ +class Option(object): + pass + +class Internal(object): + pass + +def getDefaultedConfigOptions(): + """ + Used in Config subclass so as to pass only appropriate args to its super() + Is a function as a way to decouple the empty configurationOptionsList from prior invocations + """ + return { + 'configurationOptionsList':[], + 'optionNameForConfigFile':'config', + 'configurationFileRequired':False, + 'configurationModule':None, + 'automaticHelp':True, + 'applicationName':'', + 'helpHandler':None, + } + +namedConfigOptions = [ + 'configurationOptionsList', + 'optionNameForConfigFile', + 'ConfigurationFileRequired', + 'configurationModule', + 'automaticHelp', + 'applicationName', + 'helpHandler', + ] + +#------------------------------------------------------------------------------------------ +def newConfiguration(**kwargs): + """ This 
used as an alternate constructor for class Config so that applications can + be lax in defining all the required paramters in the right order. + """ + kw = getDefaultedConfigOptions() + kw.update(kwargs) + return Config(**kw) + + #============================================================================================ +class Config (dict): + """This class encapsulates the process of getting command line arguments, configuration files and environment variables into a program. + It wraps the Python getopt module and provides a more comprehensive interface. + """ + + #------------------------------------------------------------------------------------------ + def __init__ (self, configurationOptionsList=[], optionNameForConfigFile="config", configurationFileRequired=False, configurationModule=None, automaticHelp=False, applicationName='', helpHandler=None): + """Initialize a new instance. + + Input Parameters: + configurationOptionsList: a list of tuples that represent the options available to the user. The tuples take this form: + (singleCharacterOptionForm, longOptionForm, takesParametersBoolean, defaultValue, humanReadableOptionExplanation [, optionalListOfOptionParameterPairs | conversionFunction ]) + The optionalListOfOptionParameterPairs to create short cuts for sets of options. The form is a list of two element tuples specifying some other option + and its value. Examples can be seen below. + conversionFunction is a function that will take a string and convert it to the target type needed for the parameter + optionNameForConfigFile: the name of the option that stores the pathname of the configuration file - if this is set to None then we assume there + is no configuration file and one will not be tried + configurationFileRequired: if True, the lack of a configuration file is considered a fatal error + """ + self.internal = Internal() + self.internal.originalConfigurationOptionsList = configurationOptionsList + self.internal.applicationName = applicationName + # incorporate config options from configuration module + try: + for key, value in configurationModule.__dict__.items(): + if type(value) == Option: + optionDefinition = [] + try: + optionDefinition.append(value.singleCharacter) #0 + except AttributeError: + optionDefinition.append(None) + optionDefinition.append(key) #1 + optionDefinition.append(True) #2 + try: + optionDefinition.append(value.default) #3 + except AttributeError: + optionDefinition.append(None) + try: + optionDefinition.append(value.doc) #4 + except AttributeError: + optionDefinition.append("%s imported from %s" % (key, configurationModule.__name__)) + try: + optionDefinition.append(value.fromStringConverter) #5 + except AttributeError: + pass + configurationOptionsList.append(optionDefinition) + else: + if key[:2] != "__" and type(value) != type(os): + configurationOptionsList.append([None, key, True, value, "%s imported from %s" % (key, configurationModule.__name__)]) + except AttributeError: + pass #we're apparently not using an initialization module + + self.internal.singleLetterCommandLineOptionsForGetopt = "" + self.internal.expandedCommandLineOptionsForGetopt = [] + + self.internal.allowableOptionDictionary = {} + self.internal.allowableLongFormOptionDictionary = {} + for x in configurationOptionsList: + if x[0]: + self.internal.allowableOptionDictionary[x[0]] = x + self.internal.allowableOptionDictionary[x[1]] = self.internal.allowableLongFormOptionDictionary[x[1]] = x + self.__addOptionsForGetopt(x) + + # add autohelp if needed + if automaticHelp and 
("help" not in self.internal.allowableLongFormOptionDictionary): + helpOptionTuple = ('?', 'help', False, None, 'print this list') + configurationOptionsList.append(helpOptionTuple) + self.internal.allowableOptionDictionary[helpOptionTuple[0]] = helpOptionTuple + self.internal.allowableOptionDictionary[helpOptionTuple[1]] = self.internal.allowableLongFormOptionDictionary[helpOptionTuple[1]] = helpOptionTuple + self.__addOptionsForGetopt(helpOptionTuple) + + # handle help requests appropriately + self.internal.helpHandler = self.__nothingHelpHandler # default is no autohelp + if helpHandler: # if user handed us one, use it + self.internal.helpHandler = helpHandler + elif "help" in self.internal.allowableLongFormOptionDictionary: # if needed, use default + self.internal.helpHandler = self.__defaultHelpHandler + + # setup all defaults for options: + for x in configurationOptionsList: + #if x[2] and x[3] is not None: + if x[2]: + self[x[1]] = x[3] + + # get options from the environment - these override defaults + for x in os.environ: + if self.internal.allowableOptionDictionary.has_key(x): + self[self.internal.allowableOptionDictionary[x][1]] = os.environ.get(x) + self.__insertCombinedOption(x, self) + + # get the options from the command line - these will eventually override all other methods of setting options + try: + options, ignoreArgs = getopt.getopt(sys.argv[1:], self.internal.singleLetterCommandLineOptionsForGetopt, self.internal.expandedCommandLineOptionsForGetopt) + except getopt.GetoptError, e: + pass #TODO - temporary measure + #raise NotAnOptionError, e + commandLineEnvironment = {} # save these options for merging later + for x in options: + if len(x[0]) == 2: #single letter option + longFormOfSingleLetterOption = self.internal.allowableOptionDictionary[x[0][1]][1] + if self.internal.allowableOptionDictionary[longFormOfSingleLetterOption][2]: + commandLineEnvironment[longFormOfSingleLetterOption] = x[1] + else: + commandLineEnvironment[longFormOfSingleLetterOption] = None + self.__insertCombinedOption(longFormOfSingleLetterOption, commandLineEnvironment) + else: + longFormOption = x[0][2:] + if self.internal.allowableOptionDictionary[longFormOption][2]: + commandLineEnvironment[longFormOption] = x[1] + else: + commandLineEnvironment[longFormOption] = None + self.__insertCombinedOption(longFormOption, commandLineEnvironment) + + # get any options from the config file + # any options already set in the environment are overridden + if optionNameForConfigFile is not None: + configFile = None + try: + try: + try: + configFile = open(commandLineEnvironment[optionNameForConfigFile], 'r') + except KeyError: + configFile = open(self[optionNameForConfigFile], 'r') + except IOError, e: + raise ConfigFileMissingError() + for x in configFile: + x = x.strip() + if not x or x[0] == '#' : continue + key,value = x.split('=', 1) + key = key.rstrip() + if not key: continue + value = value.lstrip() + if self.internal.allowableOptionDictionary.has_key(key): + longFormOption = self.internal.allowableOptionDictionary[key][1] + self.__insertCombinedOption(longFormOption, self) + try: + self[longFormOption] = value + except IndexError: + self[longFormOption] = None + else: + raise NotAnOptionError, "option '%s' in the config file is not recognized" % key + except KeyError,x: + if configurationFileRequired: + raise ConfigFileOptionNameMissingError() + except IOError: + if configurationFileRequired: + raise ConfigFileMissingError() + finally: + if configFile: configFile.close() + + # merge command line 
options with the workingEnvironment + # any options already specified in the environment or + # configuration file are overridden. + for x in commandLineEnvironment: + self[x] = commandLineEnvironment[x] + + # mix in combo commandline arguments + for optionTuple in self.internal.allowableLongFormOptionDictionary.values(): + try: + if type(optionTuple[5]) == list and optionTuple[1] in self: + for longFormOptionFromCombo, valueFromCombo in optionTuple[5]: + self[longFormOptionFromCombo] = valueFromCombo + except IndexError: + pass #not a combo option + + # make sure that non-string values in the workingEnvironment + # have the right type. Assume the default value has the right + # type and cast the existing value to that type iff no conversion + # function was supplied + for optionTuple in self.internal.allowableLongFormOptionDictionary.values(): + try: + conversionFunction = optionTuple[5] + except IndexError: + conversionFunction = type(optionTuple[3]) + if conversionFunction not in (str, list, type(None)): + try: + self[optionTuple[1]] = conversionFunction(self[optionTuple[1]]) + except (KeyError, TypeError): + pass + except ValueError, x: + raise CannotConvert(str(x)) + + # do help (auto or otherwise) + if 'help' in self: + self.internal.helpHandler(self) + + #------------------------------------------------------------------------------------------ + def __nothingHelpHandler(self, config): + pass + + #------------------------------------------------------------------------------------------ + def __defaultHelpHandler(self, config): + if self.internal.applicationName: + print >>sys.stderr, self.internal.applicationName + self.outputCommandSummary(sys.stderr, 1) + sys.exit() + + #------------------------------------------------------------------------------------------ + def __addOptionsForGetopt (self, optionTuple): + """Internal Use - during setup, this function sets up internal structures with a new optionTuple. + + Parameters: + optionTuple: a tuple of the form - (singleCharacterOptionForm, longOptionForm, takesParametersBoolean, ...) + """ + if optionTuple[2]: #does this option have parameters? + if optionTuple[0]: + self.internal.singleLetterCommandLineOptionsForGetopt = "%s%s:" % (self.internal.singleLetterCommandLineOptionsForGetopt, optionTuple[0]) + self.internal.expandedCommandLineOptionsForGetopt.append("%s=" % optionTuple[1]) + else: + if optionTuple[0]: + self.internal.singleLetterCommandLineOptionsForGetopt = "%s%s" % (self.internal.singleLetterCommandLineOptionsForGetopt, optionTuple[0]) + self.internal.expandedCommandLineOptionsForGetopt.append(optionTuple[1]) + + #------------------------------------------------------------------------------------------ + def __insertCombinedOption (self, anOption, theDictionaryToInsertInto): + """Internal Use - during setup, maybe set short-cut option(s) from the allowableOptionDictionary + + Parameters: + option: key into the allowableOptionDictionary + Action: + If the key is found, look for optional (key,value) pairs that define this option as a short-cut for one or more defaults. + For each short-cut found, set the short-cut key and value in the given dictionary. 
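+    Example:
+      an option tuple whose sixth element is [("alpha", 22), ("beta", 10)]
+      acts as a short-cut: encountering that option sets alpha to 22 and
+      beta to 10 in theDictionaryToInsertInto.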
+ """ + try: + for x in self.internal.allowableOptionDictionary[anOption][5]: + theDictionaryToInsertInto[x[0]] = x[1] + except (KeyError, IndexError, TypeError) : + pass + + #------------------------------------------------------------------------------------------ + def dumpAllowableOptionDictionary(self): + """ for debugging and understanding what the heck is going on + """ + try: + for k in self.internal.allowableOptionDictionary.keys(): + v = self.internal.allowableOptionDictionary.get(k) + print "%-8s (%d) %s" % (k,len(v),str(v)) + except: + print 'No dictionary available' + + #------------------------------------------------------------------------------------------ + def outputCommandSummary (self, outputStream=sys.stdout, sortOption=0, outputTemplateForOptionsWithParameters="--%s\n\t\t%s (default: %s)", + outputTemplateForOptionsWithoutParameters="--%s\n\t\t%s", + outputTemplatePrefixForSingleLetter="\t-%s, ", + outputTemplatePrefixForNoSingleLetter="\t "): + """outputs the list of acceptable commands. This is useful as the output of the 'help' option or usage. + + Parameters: + outputStream: where to send the output + sortOption: 0 - sort by the single character option + 1 - sort by the long option + outputTemplateForOptionsWithParameters: a string template for outputing options that have parameters from the long form onward + outputTemplateForOptionsWithoutParameters: a string template for outputing options that have no parameters from the long form onward + outputTemplatePrefixForSingleLetter: a string template for the first part of a listing where there is a single letter form of the command + outputTemplatePrefixForNo: a string template for the first part of a listing where there is no single letter form of the command + """ + optionsList = [ x for x in self.internal.originalConfigurationOptionsList ] + optionsList.sort(lambda a, b: (a[sortOption] > b[sortOption]) or -(a[sortOption] < b[sortOption])) + for x in optionsList: + if x[0]: + prefix = outputTemplatePrefixForSingleLetter + commandTuple = (x[0], x[1], x[4], x[3]) + else: + prefix = outputTemplatePrefixForNoSingleLetter + commandTuple = (x[1], x[4], x[3]) + if x[2]: + print >>outputStream, ("%s%s" % (prefix, outputTemplateForOptionsWithParameters)) % commandTuple + else: + print >>outputStream, ("%s%s" % (prefix, outputTemplateForOptionsWithoutParameters)) % commandTuple[:-1] + + #------------------------------------------------------------------------------------------ + def output (self, outputStream=sys.stdout, outputTemplateForOptionsWithParameters="\t%s=%s", outputTemplateForOptionsWithoutParameters="\t%s", blockPassword=True): + """this routine will write the current values of all options to an output stream. 
+ + Parameters: + outputStream: where to write the output + outputTemplateForOptionsWithParameters: a string template for outputing options that have parameters + outputTemplateForOptionsWithoutParameters: a string template for outputing options that have no parameters + blockPassword: a boolean controlling the output of options that have the string 'password' in their name + True - the value will be printed as ********** + False - the value will print normally + """ + environmentList = [x for x in self.iteritems() ] + environmentList.sort(lambda x, y: (x[0] > y[0]) or -(x[0] < y[0])) + for x in environmentList: + if blockPassword and x[1] is not None and "password" in x[0].lower(): + print >>outputStream, outputTemplateForOptionsWithParameters % (x[0], "*" * 10) + continue + if x[1] is not None: + print >>outputStream, outputTemplateForOptionsWithParameters % x + else: + print >>outputStream, outputTemplateForOptionsWithoutParameters % x[0] + + #------------------------------------------------------------------------------------------ + def __str__ (self): + """ return a string representation of the options and their states. + """ + stringio = cStringIO.StringIO() + self.output(stringio) + s = stringio.getvalue() + stringio.close() + return s + + #------------------------------------------------------------------------------------------ + def __getattr__(self, name): + """ this function implements an interface allowing the entries in the dictionary + to be accessed using the dot operator: self["fred"] == self.fred + """ + try: + return self[name] + except KeyError, x: + raise AttributeError(x) + +ConfigurationManager = Config #for legacy compatibility + +#------------------------------------------------------------------------------------------ +def ioConverter(inputString): + """ a conversion function for to select stdout, stderr or open a file for writing + """ + if type(inputString) is str: + lowerInputString = inputString.lower() + if lowerInputString == 'stdout': + return sys.stdout + if lowerInputString == 'stderr': + return sys.stderr + return open(inputString, "w") + return inputString + +#------------------------------------------------------------------------------------------ +def dateTimeConverter(inputString): + """ a conversion function for datetimes + """ + return string_to_datetime(inputString) + +#------------------------------------------------------------------------------------------ +def timeDeltaConverter(inputString): + """ a conversion function for time deltas + """ + if type(inputString) is str: + days,hours,minutes,seconds = 0,0,0,0 + details = inputString.split(':') + if len(details) >= 4: + days = int(details[-4]) + if len(details) >= 3: + hours = int(details[-3]) + if len(details) >= 2: + minutes = int(details[-2]) + if len(details) >= 1: + seconds = int(details[-1]) + return datetime.timedelta(days = days, hours = hours, minutes = minutes, seconds = seconds) + return inputString + + +#------------------------------------------------------------------------------------------ +def booleanConverter(inputString): + """ a conversion function for boolean + """ + if type(inputString) is str: + return inputString.lower() in ("true", "t", "1") + return inputString + +#------------------------------------------------------------------------------- +def classConverter(input_str): + """ a conversion that will import a module and class name + """ + parts = input_str.split('.') + try: + # first try as a complete module + package = __import__(input_str) + except 
ImportError, x: + if len(parts) == 1: + # maybe this is a builtin + return eval(input_str) + # it must be a class from a module + package = __import__('.'.join(parts[:-1]), globals(), locals(), []) + obj = package + for name in parts[1:]: + obj = getattr(obj, name) + return obj + + +if __name__ == "__main__": + + def doubler (aString): + return float(aString) * 2 + + commandLineOptions = [ ('c', 'config', True, './config', "the config file"), + ('a', 'alpha', True, 600, "the alpha option takes an int"), + ('b', 'beta', True, 'hello', 'the beta option takes a string'), + ('g', 'gamma', False, None, "the gamma option accepts no parameter"), + ('f', 'floater', True, 3.1415, "the floater option"), + ('d', 'doubler', True, 3.1415, "the doubler option", doubler), + ('p', 'secretpassword', True, '', "the password - it won't print when listing configuration"), + ('o', 'ostream', True, 'stdout', 'output stream', ioConverter), + ('d', 'dt', True, '1960-05-04 15:10:00', 'aDateTime', dateTimeConverter), + ('l', 'timedelta', True, '123:11:16', 'output stream', timeDeltaConverter), + (None, 'noShort', False, None, 'only available as a long option'), + ('$', 'dollar', False, None, "combo of 'alpha=22, beta=10'", [ ("alpha", 22), ("beta", 10) ] ), + ('#', 'hash', False, None, "combo of 'alpha=2, beta=100, gamma, doubler=23'", [ ("alpha", 2), ("beta", 100), ("gamma", None), ("doubler", 23) ] ), + ] + + cm = newConfiguration(configurationOptionsList=commandLineOptions) + + print cm + cm.dumpAllowableOptionDictionary() +# print "AOPTDICT" +# for k in cm.allowableOptionDictionary.keys(): + + + + print cm.doubler + print cm.secretpassword + try: + print cm.dollar + except AttributeError: + print "try running with the -$ option for more exciting fun" + try: + print cm.hash + except AttributeError: + print "try running with the -# option for more exciting fun" + + + #import config + #cm = newConfiguration(configurationModule=config) + #print cm diff --git a/socorro/lib/psycopghelper.py b/socorro/lib/psycopghelper.py new file mode 100644 index 0000000000..7065df8be4 --- /dev/null +++ b/socorro/lib/psycopghelper.py @@ -0,0 +1,155 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +""" Deprecated Module """ + +import psycopg2 +import psycopg2.extensions +import datetime +import threading + +import socorro.lib.util as util + +#----------------------------------------------------------------------------------------------------------------- +def singleValueSql (aCursor, sql, parameters=None): + """ Deprecated """ + aCursor.execute(sql, parameters) + result = aCursor.fetchall() + try: + return result[0][0] + except Exception, x: + raise SQLDidNotReturnSingleValue("%s: %s" % (str(x), sql)) + +#----------------------------------------------------------------------------------------------------------------- +def singleRowSql (aCursor, sql, parameters=None): + """ Deprecated """ + aCursor.execute(sql, parameters) + result = aCursor.fetchall() + try: + return result[0] + except Exception, x: + raise SQLDidNotReturnSingleRow("%s: %s" % (str(x), sql)) + +#----------------------------------------------------------------------------------------------------------------- +def execute (aCursor, sql, parameters=None): + """ Deprecated """ + aCursor.execute(sql, parameters) + while True: + aRow = aCursor.fetchone() + if aRow is not None: + yield aRow + else: + break + +#================================================================================================================= +class LoggingCursor(psycopg2.extensions.cursor): + """Use as cursor_factory when getting cursor from connection: + ... + cursor = connection.cursor(cursor_factory = socorro.lib.pyscopghelper.LoggingCursor) + cursor.setLogger(someLogger) + ... + """ + #----------------------------------------------------------------------------------------------------------------- + def setLogger(self, logger): + self.logger = logger + self.logger.info("Now logging cursor") + #----------------------------------------------------------------------------------------------------------------- + def execute(self, sql, args=None): + try: + self.logger.info(self.mogrify(sql,args)) + except AttributeError: + pass + super(LoggingCursor, self).execute(sql,args) + def executemany(self,sql,args=None): + try: + try: + self.logger.info("%s ..." % (self.mogrify(sql,args[0]))) + except TypeError: + self.logger.info("%s ..." 
% (sql)) + except AttributeError: + pass + super(LoggingCursor,self).executemany(sql,args) + +#================================================================================================================= +class SQLDidNotReturnSingleValue (Exception): + pass + +#================================================================================================================= +class SQLDidNotReturnSingleRow (Exception): + pass + +#================================================================================================================= +class CannotConnectToDatabase(Exception): + pass + +#================================================================================================================= +class DatabaseConnectionPool(dict): + """ Deprecated """ + #----------------------------------------------------------------------------------------------------------------- + def __init__(self, databaseHostName, databaseName, databaseUserName, databasePassword, logger=util.FakeLogger()): + super(DatabaseConnectionPool, self).__init__() + if databaseHostName != '': + self.dsn = "host=%s dbname=%s user=%s password=%s" % (databaseHostName, databaseName, databaseUserName, databasePassword) + else: + self.dsn = "dbname=%s user=%s password=%s" % (databaseName, databaseUserName, databasePassword) + self.logger = logger + + #----------------------------------------------------------------------------------------------------------------- + def connectToDatabase(self): + """ Deliberately do NOT put the connection into the pool""" + threadName = threading.currentThread().getName() + try: + self.logger.info("%s - connecting to database", threadName) + connection = psycopg2.connect(self.dsn) + return (connection, connection.cursor()) + except Exception, x: + self.logger.critical("%s - cannot connect to the database", threadName) + raise CannotConnectToDatabase(x) + + #----------------------------------------------------------------------------------------------------------------- + def connectionCursorPairNoTest(self): + """Try to re-use this thread's connection, else create one and use that""" + threadName = threading.currentThread().getName() + try: + return self[threadName] + except KeyError: + self[threadName] = self.connectToDatabase() + return self[threadName] + + #----------------------------------------------------------------------------------------------------------------- + def connectionCursorPair(self): + """Like connecionCursorPairNoTest, but test that the specified connection actually works""" + connection, cursor = self.connectionCursorPairNoTest() + try: + cursor.execute("select 1") + cursor.fetchall() + return (connection, cursor) + #except (psycopg2.OperationalError, psycopg2.ProgrammingError): + except psycopg2.Error: + # did the connection time out? 
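+      # discard this thread's cached connection, reconnect once and retest;
+      # a second failure is treated as a fatal connection problem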
+ self.logger.info("%s - trying to re-establish a database connection", threading.currentThread().getName()) + try: + del self[threading.currentThread().getName()] + connection, cursor = self.connectionCursorPairNoTest() + cursor.execute("select 1") + cursor.fetchall() + return (connection, cursor) + #except (psycopg2.OperationalError, psycopg2.ProgrammingError): + except Exception, x: + self.logger.critical("%s - something's gone horribly wrong with the database connection", threading.currentThread().getName()) + raise CannotConnectToDatabase(x) + + #----------------------------------------------------------------------------------------------------------------- + def cleanup (self): + self.logger.debug("%s - killing thread database connections", threading.currentThread().getName()) + for i, aDatabaseConnectionPair in self.iteritems(): + try: + aDatabaseConnectionPair[0].rollback() + aDatabaseConnectionPair[0].close() + self.logger.debug("%s - connection %s closed", threading.currentThread().getName(), i) + except psycopg2.InterfaceError: + self.logger.debug("%s - connection %s already closed", threading.currentThread().getName(), i) + except: + util.reportExceptionAndContinue(self.logger) + diff --git a/socorro/monitor/__init__.py b/socorro/monitor/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/socorro/monitor/crashstore_new_crash_source.py b/socorro/monitor/crashstore_new_crash_source.py new file mode 100644 index 0000000000..6d69f6839d --- /dev/null +++ b/socorro/monitor/crashstore_new_crash_source.py @@ -0,0 +1,31 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +from configman import Namespace, RequiredConfig +from configman.converters import class_converter + + +#============================================================================== +class CrashStorageNewCrashSource(RequiredConfig): + """this class provides an iterator that will pull from any crash + storage class new_ooids generator""" + required_config = Namespace() + required_config.add_option( + 'crashstorage_class', + doc="the class of the crashstorage system", + default='socorro.external.filesystem.crashstorage.' + 'FileSystemRawCrashStorage', + from_string_converter=class_converter + ) + + def __init__(self, config, name, quit_check_callback=None): + """'name' is unused by this class but is present for api consistency""" + self.config = config + self.crash_store = config.crashstorage_class(config) + + def __call__(self): + return self.crash_store.new_crashes() + + def close(self): + self.crash_store.close() diff --git a/socorro/monitor/monitor.py b/socorro/monitor/monitor.py new file mode 100755 index 0000000000..42c9d374cf --- /dev/null +++ b/socorro/monitor/monitor.py @@ -0,0 +1,512 @@ +#! /usr/bin/env python +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ + +import psycopg2 + +import time +import signal +import threading + +import logging + +logger = logging.getLogger("monitor") + +import socorro.lib.util +import socorro.external.filesystem.filesystem +import socorro.lib.psycopghelper as psy +import socorro.database.database as sdb +import socorro.storage.crashstorage as cstore +import socorro.external.hbase.hbase_client as hbc + +from socorro.lib.datetimeutil import utc_now + +#================================================================================================================= +class UuidNotFoundException(Exception): + pass + +#================================================================================================================= +class Monitor (object): + _config_requirements = ("databaseHost", + "databaseName", + "databaseUserName", + "databasePassword", + "processorCheckInTime", + "standardLoopDelay", + "cleanupJobsLoopDelay", + "priorityLoopDelay", + "hbaseHost", + "hbasePort", + ) + + #----------------------------------------------------------------------------------------------------------------- + def __init__(self, config, logger=logger, sdb=sdb, cstore=cstore, signal=signal): + super(Monitor, self).__init__() + config.logger = logger + + for x in Monitor._config_requirements: + assert x in config, '%s missing from configuration' % x + + self.crashStorePool = cstore.CrashStoragePool(config) + + self.sdb = sdb + + self.standardLoopDelay = config.standardLoopDelay.seconds + self.cleanupJobsLoopDelay = config.cleanupJobsLoopDelay.seconds + self.priorityLoopDelay = config.priorityLoopDelay.seconds + + self.databaseConnectionPool = self.sdb.DatabaseConnectionPool(config, logger) + + self.config = config + signal.signal(signal.SIGTERM, Monitor.respondToSIGTERM) + signal.signal(signal.SIGHUP, Monitor.respondToSIGTERM) + + self.quit = False + + #----------------------------------------------------------------------------------------------------------------- + class NoProcessorsRegisteredException (Exception): + pass + + #----------------------------------------------------------------------------------------------------------------- + @staticmethod + def respondToSIGTERM(signalNumber, frame): + """ these classes are instrumented to respond to a KeyboardInterrupt by cleanly shutting down. + This function, when given as a handler to for a SIGTERM event, will make the program respond + to a SIGTERM as neatly as it responds to ^C. 
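+    Monitor.__init__ registers this handler for both SIGTERM and SIGHUP,
+    so either signal is converted into a KeyboardInterrupt.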
+ """ + signame = 'SIGTERM' + if signalNumber != signal.SIGTERM: signame = 'SIGHUP' + logger.info("%s detected", signame) + raise KeyboardInterrupt + + #----------------------------------------------------------------------------------------------------------------- + def quitCheck(self): + if self.quit: + raise KeyboardInterrupt + + #----------------------------------------------------------------------------------------------------------------- + def responsiveSleep (self, seconds): + for x in xrange(int(seconds)): + self.quitCheck() + time.sleep(1.0) + + #----------------------------------------------------------------------------------------------------------------- + def getDatabaseConnectionPair (self): + try: + connection = self.databaseConnectionPool.connection() + cursor = connection.cursor() + return (connection, cursor) + except self.sdb.CannotConnectToDatabase: + self.quit = True + self.databaseConnectionPool.cleanup() + socorro.lib.util.reportExceptionAndAbort(logger) # can't continue without a database connection + + #----------------------------------------------------------------------------------------------------------------- + def cleanUpCompletedAndFailedJobs (self): + logger.debug("dealing with completed and failed jobs") + # check the jobs table to and deal with the completed and failed jobs + databaseConnection, databaseCursor = self.getDatabaseConnectionPair() + try: + logger.debug("starting deletion") + databaseCursor.execute("""delete from jobs + where + uuid in (select + uuid + from + jobs j + where + j.success is not null) + """) + databaseConnection.commit() + logger.debug("end of this cleanup iteration") + except Exception, x: + logger.debug("it died: %s", x) + databaseConnection.rollback() + socorro.lib.util.reportExceptionAndContinue(logger) + + #----------------------------------------------------------------------------------------------------------------- + def cleanUpDeadProcessors (self, aCursor): + """ look for dead processors - find all the jobs of dead processors and assign them to live processors + then delete the dead processors + """ + logger.info("looking for dead processors") + try: + logger.info("threshold %s", self.config.processorCheckInTime) + threshold = psy.singleValueSql(aCursor, "select now() - interval '%s' * 2" % self.config.processorCheckInTime) + #sql = "select id from processors where lastSeenDateTime < '%s'" % (threshold,) + #logger.info("dead processors sql: %s", sql) + aCursor.execute("select id from processors where lastSeenDateTime < '%s'" % (threshold,)) + deadProcessors = aCursor.fetchall() + aCursor.connection.commit() + logger.info("dead processors: %s", str(deadProcessors)) + if deadProcessors: + logger.info("found dead processor(s):") + for aDeadProcessorTuple in deadProcessors: + logger.info("%d is dead", aDeadProcessorTuple[0]) + stringOfDeadProcessorIds = ", ".join([str(x[0]) for x in deadProcessors]) + logger.info("getting list of live processor(s):") + aCursor.execute("select id from processors where lastSeenDateTime >= '%s'" % threshold) + liveProcessors = aCursor.fetchall() + if not liveProcessors: + raise Monitor.NoProcessorsRegisteredException("There are no processors registered") + numberOfLiveProcessors = len(liveProcessors) + logger.info("getting range of queued date for jobs associated with dead processor(s):") + aCursor.execute("select min(queueddatetime), max(queueddatetime) from jobs where owner in (%s)" % stringOfDeadProcessorIds) + earliestDeadJob, latestDeadJob = aCursor.fetchall()[0] + if 
earliestDeadJob is not None and latestDeadJob is not None: + timeIncrement = (latestDeadJob - earliestDeadJob) / numberOfLiveProcessors + for x, liveProcessorId in enumerate(liveProcessors): + lowQueuedTime = x * timeIncrement + earliestDeadJob + highQueuedTime = (x + 1) * timeIncrement + earliestDeadJob + logger.info("assigning jobs from %s to %s to processor %s:", str(lowQueuedTime), str(highQueuedTime), liveProcessorId) + # why is the range >= at both ends? the range must be inclusive, the risk of moving a job twice is low and consequences low, too. + # 1st step: take any jobs of a dead processor that were in progress and reset them to unprocessed + aCursor.execute("""update jobs + set starteddatetime = NULL + where + %%s >= queueddatetime + and queueddatetime >= %%s + and owner in (%s) + and success is NULL""" % stringOfDeadProcessorIds, (highQueuedTime, lowQueuedTime)) + # 2nd step: take all jobs of a dead processor and give them to a new owner + aCursor.execute("""update jobs + set owner = %%s + where + %%s >= queueddatetime + and queueddatetime >= %%s + and owner in (%s)""" % stringOfDeadProcessorIds, (liveProcessorId, highQueuedTime, lowQueuedTime)) + aCursor.connection.commit() + #3rd step - transfer stalled priority jobs to new processor + for deadProcessorTuple in deadProcessors: + logger.info("re-assigning priority jobs from processor %d:", deadProcessorTuple[0]) + try: + aCursor.execute("""insert into priorityjobs (uuid) select uuid from priority_jobs_%d""" % deadProcessorTuple) + aCursor.connection.commit() + except: + aCursor.connection.rollback() + logger.info("removing all dead processors") + aCursor.execute("delete from processors where lastSeenDateTime < '%s'" % threshold) + aCursor.connection.commit() + # remove dead processors' priority tables + for aDeadProcessorTuple in deadProcessors: + try: + aCursor.execute("drop table priority_jobs_%d" % aDeadProcessorTuple[0]) + aCursor.connection.commit() + except: + logger.warning("cannot clean up dead processor in database: the table 'priority_jobs_%d' may need manual deletion", aDeadProcessorTuple[0]) + aCursor.connection.rollback() + except Monitor.NoProcessorsRegisteredException: + self.quit = True + socorro.lib.util.reportExceptionAndAbort(logger, showTraceback=False) + except: + socorro.lib.util.reportExceptionAndContinue(logger) + + #----------------------------------------------------------------------------------------------------------------- + @staticmethod + def compareSecondOfSequence (x, y): + return cmp(x[1], y[1]) + + #----------------------------------------------------------------------------------------------------------------- + #@staticmethod + #def secondOfSequence(x): + #return x[1] + + #----------------------------------------------------------------------------------------------------------------- + def jobSchedulerIter(self, aCursor): + """ This takes a snap shot of the state of the processors as well as the number of jobs assigned to each + then acts as an iterator that returns a sequence of processor ids. 
Order of ids returned will assure that + jobs are assigned in a balanced manner + """ + logger.debug("balanced jobSchedulerIter: compiling list of active processors") + try: + sql = """select + p.id, + count(j.owner) + from + processors p left join jobs j on p.id = j.owner + and p.lastSeenDateTime > now() - interval %s + and j.success is null + group by p.id""" + try: + aCursor.execute(sql, (self.config.processorCheckInTime,) ) + logger.debug("sql succeeded") + aCursor.connection.commit() + except psycopg2.ProgrammingError: + logger.debug("some other database transaction failed and didn't close properly. Roll it back and try to continue.") + try: + aCursor.connection.rollback() + aCursor.execute(sql) + except: + logger.debug("sql failed for the 2nd time - quit") + self.quit = True + aCursor.connection.rollback() + socorro.lib.util.reportExceptionAndAbort(logger) + listOfProcessorIds = [[aRow[0], aRow[1]] for aRow in aCursor.fetchall()] #processorId, numberOfAssignedJobs + logger.debug("listOfProcessorIds: %s", str(listOfProcessorIds)) + if not listOfProcessorIds: + raise Monitor.NoProcessorsRegisteredException("There are no processors registered") + while True: + logger.debug("sort the list of (processorId, numberOfAssignedJobs) pairs") + listOfProcessorIds.sort(Monitor.compareSecondOfSequence) + # the processor with the fewest jobs is about to be assigned a new job, so increment its count + listOfProcessorIds[0][1] += 1 + logger.debug("yield the processorId which had the fewest jobs: %d", listOfProcessorIds[0][0]) + yield listOfProcessorIds[0][0] + except Monitor.NoProcessorsRegisteredException: + self.quit = True + socorro.lib.util.reportExceptionAndAbort(logger) + + #----------------------------------------------------------------------------------------------------------------- + def unbalancedJobSchedulerIter(self, aCursor): + """ This generator returns a sequence of active processorId without regard to job balance + """ + logger.debug("unbalancedJobSchedulerIter: compiling list of active processors") + try: + threshold = psy.singleValueSql( aCursor, "select now() - interval '%s'" % self.config.processorCheckInTime) + aCursor.execute("select id from processors where lastSeenDateTime > '%s'" % threshold) + listOfProcessorIds = [aRow[0] for aRow in aCursor.fetchall()] + if not listOfProcessorIds: + raise Monitor.NoProcessorsRegisteredException("There are no active processors registered") + while True: + for aProcessorId in listOfProcessorIds: + yield aProcessorId + except Monitor.NoProcessorsRegisteredException: + self.quit = True + socorro.lib.util.reportExceptionAndAbort(logger) + + #----------------------------------------------------------------------------------------------------------------- + def queueJob (self, databaseCursor, uuid, processorIdSequenceGenerator, priority=0): + logger.debug("trying to insert %s", uuid) + processorIdAssignedToThisJob = processorIdSequenceGenerator.next() + try: + databaseCursor.execute("insert into jobs (pathname, uuid, owner, priority, queuedDateTime) values (%s, %s, %s, %s, %s)", + ('', uuid, processorIdAssignedToThisJob, priority, utc_now())) + logger.debug("executed insert for %s", uuid) + databaseCursor.connection.commit() + except: + databaseCursor.connection.rollback() + raise + logger.debug("%s assigned to processor %d", uuid, processorIdAssignedToThisJob) + return processorIdAssignedToThisJob + + #----------------------------------------------------------------------------------------------------------------- + def queuePriorityJob (self, 
databaseCursor, uuid, processorIdSequenceGenerator): + processorIdAssignedToThisJob = self.queueJob(databaseCursor, uuid, processorIdSequenceGenerator, priority=1) + if processorIdAssignedToThisJob: + databaseCursor.execute("insert into priority_jobs_%d (uuid) values ('%s')" % (processorIdAssignedToThisJob, uuid)) + databaseCursor.execute("delete from priorityjobs where uuid = %s", (uuid,)) + databaseCursor.connection.commit() + return processorIdAssignedToThisJob + + #----------------------------------------------------------------------------------------------------------------- + def standardJobAllocationLoop(self): + """ + """ + try: + crashStorage = self.crashStorePool.crashStorage() + except hbc.NoConnectionException: + self.quit = True + logger.critical("hbase is gone! hbase is gone!") + socorro.lib.util.reportExceptionAndAbort(logger) + except Exception: + self.quit = True + socorro.lib.util.reportExceptionAndContinue(logger) + raise + try: + try: + databaseConnection = None + while (True): + databaseConnection, databaseCursor = self.getDatabaseConnectionPair() + self.cleanUpDeadProcessors(databaseCursor) + self.quitCheck() + # walk the dump indexes and assign jobs + logger.debug("getting jobSchedulerIter") + processorIdSequenceGenerator = self.jobSchedulerIter(databaseCursor) + logger.debug("beginning index scan") + try: + logger.debug("starting destructiveDateWalk") + for uuid in crashStorage.newUuids(): + try: + logger.debug("looping: %s", uuid) + self.quitCheck() + self.queueJob(databaseCursor, uuid, processorIdSequenceGenerator) + except KeyboardInterrupt: + logger.debug("inner detects quit") + self.quit = True + raise + except: + socorro.lib.util.reportExceptionAndContinue(logger) + logger.debug("ended destructiveDateWalk") + except hbc.FatalException: + raise + except: + socorro.lib.util.reportExceptionAndContinue(logger, loggingLevel=logging.CRITICAL) + logger.debug("end of loop - about to sleep") + self.quitCheck() + self.responsiveSleep(self.standardLoopDelay) + except hbc.FatalException, x: + logger.debug("somethings gone horribly wrong with HBase") + socorro.lib.util.reportExceptionAndContinue(logger, loggingLevel=logging.CRITICAL) + if databaseConnection is not None: + databaseConnection.rollback() + self.quit = True + except (KeyboardInterrupt, SystemExit): + logger.debug("outer detects quit") + if databaseConnection is not None: + databaseConnection.rollback() + self.quit = True + raise + finally: + if databaseConnection is not None: + databaseConnection.close() + logger.debug("standardLoop done.") + + #----------------------------------------------------------------------------------------------------------------- + def getPriorityUuids(self, aCursor): + aCursor.execute("select * from priorityjobs;") + setOfPriorityUuids = set() + for aUuidRow in aCursor.fetchall(): + setOfPriorityUuids.add(aUuidRow[0]) + return setOfPriorityUuids + + #----------------------------------------------------------------------------------------------------------------- + def lookForPriorityJobsAlreadyInQueue(self, databaseCursor, setOfPriorityUuids): + # check for uuids already in the queue + for uuid in list(setOfPriorityUuids): + self.quitCheck() + try: + prexistingJobOwner = psy.singleValueSql(databaseCursor, "select owner from jobs where uuid = '%s'" % uuid) + logger.info("priority job %s was already in the queue, assigned to %d", uuid, prexistingJobOwner) + try: + databaseCursor.execute("insert into priority_jobs_%d (uuid) values ('%s')" % (prexistingJobOwner, uuid)) + except 
psycopg2.ProgrammingError: + logger.debug("%s assigned to dead processor %d - wait for reassignment", uuid, prexistingJobOwner) + # likely that the job is assigned to a dead processor + # skip processing it this time around - by next time hopefully it will have been + # re assigned to a live processor + databaseCursor.connection.rollback() + setOfPriorityUuids.remove(uuid) + continue + databaseCursor.execute("delete from priorityjobs where uuid = %s", (uuid,)) + databaseCursor.connection.commit() + setOfPriorityUuids.remove(uuid) + except psy.SQLDidNotReturnSingleValue: + #logger.debug("priority job %s was not already in the queue", uuid) + pass + + #----------------------------------------------------------------------------------------------------------------- + def lookForPriorityJobsInDumpStorage(self, databaseCursor, setOfPriorityUuids): + # check for jobs in symlink directories + logger.debug("starting lookForPriorityJobsInDumpStorage") + processorIdSequenceGenerator = None + for uuid in list(setOfPriorityUuids): + logger.debug("looking for %s", uuid) + if self.crashStorePool.crashStorage().uuidInStorage(uuid): + logger.info("priority queuing %s", uuid) + if not processorIdSequenceGenerator: + logger.debug("about to get unbalancedJobScheduler") + processorIdSequenceGenerator = self.unbalancedJobSchedulerIter(databaseCursor) + logger.debug("unbalancedJobScheduler successfully fetched") + processorIdAssignedToThisJob = self.queuePriorityJob(databaseCursor, uuid, processorIdSequenceGenerator) + logger.info("%s assigned to %d", uuid, processorIdAssignedToThisJob) + setOfPriorityUuids.remove(uuid) + databaseCursor.execute("delete from priorityjobs where uuid = %s", (uuid,)) + databaseCursor.connection.commit() + + #----------------------------------------------------------------------------------------------------------------- + def priorityJobsNotFound(self, databaseCursor, setOfPriorityUuids, priorityTableName="priorityjobs"): + # we've failed to find the uuids anywhere + for uuid in setOfPriorityUuids: + self.quitCheck() + logger.error("priority uuid %s was never found", uuid) + databaseCursor.execute("delete from %s where uuid = %s" % (priorityTableName, "%s"), (uuid,)) + databaseCursor.connection.commit() + + #----------------------------------------------------------------------------------------------------------------- + def priorityJobAllocationLoop(self): + logger.info("priorityJobAllocationLoop starting.") + #symLinkIndexPath = os.path.join(self.config.storageRoot, "index") + #deferredSymLinkIndexPath = os.path.join(self.config.deferredStorageRoot, "index") + try: + try: + databaseConnection = None + while (True): + databaseConnection, databaseCursor = self.getDatabaseConnectionPair() + try: + self.quitCheck() + setOfPriorityUuids = self.getPriorityUuids(databaseCursor) + if setOfPriorityUuids: + logger.debug("beginning search for priority jobs") + self.lookForPriorityJobsAlreadyInQueue(databaseCursor, setOfPriorityUuids) + self.lookForPriorityJobsInDumpStorage(databaseCursor, setOfPriorityUuids) + self.priorityJobsNotFound(databaseCursor, setOfPriorityUuids) + except KeyboardInterrupt: + logger.debug("inner detects quit") + raise + except hbc.FatalException: + raise + except: + if databaseConnection is not None: + databaseConnection.rollback() + socorro.lib.util.reportExceptionAndContinue(logger) + self.quitCheck() + logger.debug("sleeping") + self.responsiveSleep(self.priorityLoopDelay) + except hbc.FatalException, x: + logger.debug("somethings gone horribly wrong with 
HBase") + socorro.lib.util.reportExceptionAndContinue(logger, loggingLevel=logging.CRITICAL) + if databaseConnection is not None: + databaseConnection.rollback() + self.quit = True + except (KeyboardInterrupt, SystemExit): + logger.debug("outer detects quit") + if databaseConnection is not None: + databaseConnection.rollback() + self.quit = True + finally: + logger.info("priorityLoop done.") + + #----------------------------------------------------------------------------------------------------------------- + def jobCleanupLoop (self): + logger.info("jobCleanupLoop starting.") + try: + try: + #logger.info("sleeping first.") + #self.responsiveSleep(self.cleanupJobsLoopDelay) + while True: + logger.info("beginning jobCleanupLoop cycle.") + self.cleanUpCompletedAndFailedJobs() + self.responsiveSleep(self.cleanupJobsLoopDelay) + except (KeyboardInterrupt, SystemExit): + logger.debug("got quit message") + self.quit = True + except: + socorro.lib.util.reportExceptionAndContinue(logger) + finally: + logger.info("jobCleanupLoop done.") + + #----------------------------------------------------------------------------------------------------------------- + def start (self): + priorityJobThread = threading.Thread(name="priorityLoopingThread", target=self.priorityJobAllocationLoop) + priorityJobThread.start() + jobCleanupThread = threading.Thread(name="jobCleanupThread", target=self.jobCleanupLoop) + jobCleanupThread.start() + try: + try: + self.standardJobAllocationLoop() + finally: + logger.debug("waiting to join.") + priorityJobThread.join() + jobCleanupThread.join() + # we're done - kill all the database connections + logger.debug("calling databaseConnectionPool.cleanup().") + self.databaseConnectionPool.cleanup() + self.crashStorePool.cleanup() + except KeyboardInterrupt: + logger.debug("KeyboardInterrupt.") + raise SystemExit + + + diff --git a/socorro/monitor/monitor_app.py b/socorro/monitor/monitor_app.py new file mode 100755 index 0000000000..1c8380e146 --- /dev/null +++ b/socorro/monitor/monitor_app.py @@ -0,0 +1,950 @@ +#! /usr/bin/env python +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +"""the monitor_app manages the jobs queue and their processor assignments""" + +import signal +import threading +import time + +from configman import Namespace +from configman.converters import class_converter, timedelta_converter + +from socorro.app.generic_app import App, main +from socorro.lib.datetimeutil import utc_now + +from socorro.external.postgresql.dbapi2_util import ( + single_value_sql, + single_row_sql, + execute_query_fetchall, + execute_no_results, + SQLDidNotReturnSingleValue +) + + +#------------------------------------------------------------------------------ +def timedelta_to_seconds_coverter(td_str): + td = timedelta_converter(td_str) + return td.seconds + td.days * 24 * 3600 + + +#============================================================================== +class NoProcessorsRegisteredError (Exception): + pass + + +#============================================================================== +class MonitorApp(App): + """the MonitorApp class is responsible for gathering new crashes and + assigning them to processors. It implements a queue within the 'jobs' + table in Posgres. 
This class is multithread hot, creating three thread in + addition to the MainThread: + + standard_job_thread - this thread polls the 'new_crash_source' for new + crashes. New crashes are entered into the It + allocates new crashes to registered + processors in a balanced manner making sure that + no processor is overloaded. + priority_job_thread - this thread polls the 'priorityjobs' table in + postgres for crashes requesting immediate + processing. It assigns jobs in an unbalanced + manner assuming that the processors to which it + assigns jobs will do them immediately without + regard to their queue size. + job_cleanup_thread - this thread simply maintains the internal 'jobs' + table. It deletes completed queue entries. It + looks for stalled jobs and resets. + """ + app_name = 'monitor' + app_version = '3.0' + app_description = __doc__ + + required_config = Namespace() + # configuration is broken into three namespaces: registrar, + # new_crash_source, and job_manager + + #-------------------------------------------------------------------------- + # registrar namespace + # this namespace is for config parameters having to do with registering + # and maintaining the list of processors available + #-------------------------------------------------------------------------- + required_config.namespace('registrar') + required_config.registrar.add_option( + 'database_class', + doc="the class of the registrar's database", + default='socorro.external.postgresql.connection_context.' + 'ConnectionContext', + from_string_converter=class_converter + ) + required_config.registrar.add_option( + 'transaction_executor_class', + default='socorro.database.transaction_executor.TransactionExecutor', + doc="a class that will manage the registrar's transactions", + from_string_converter=class_converter + ) + required_config.registrar.add_option( + 'sweep_frequency', + default='00:02:00', + doc='frequency for cleaning up dead processors', + from_string_converter=timedelta_to_seconds_coverter + ) + required_config.registrar.add_option( + 'processor_grace_period', + default='00:02:00', + doc="a processor is dead if it is this late to renew registration", + from_string_converter=timedelta_converter + ) + required_config.registrar.add_option( + 'check_in_frequency', + doc='how often the processor is required to reregister (hh:mm:ss)', + default="00:01:00", + from_string_converter=timedelta_converter + ) + required_config.registrar.add_option( + 'quit_if_no_processors', + doc='die if there are no live processors running', + default=False, + ) + + #-------------------------------------------------------------------------- + # new_crash_source namespace + # this namespace is for config parameter having to do with the source + # of new crash_ids. This generally for a crashstorage class that + # implements the 'new_crash_ids' iterator + #-------------------------------------------------------------------------- + required_config.namespace('new_crash_source') + required_config.new_crash_source.add_option( + 'new_crash_source_class', + doc='an iterable that will stream crash_ids needing processing', + default='socorro.monitor.crashstore_new_crash_source.' + 'CrashStorageNewCrashSource', + from_string_converter=class_converter + ) + + #-------------------------------------------------------------------------- + # job_manager namespace + # this namespace is for config parameter having to do with maintaining + # the 'jobs' and 'priortityjobs' tables and assigning jobs to + # processors. 
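The frequency options above are declared as 'hh:mm:ss' strings and run through ``timedelta_to_seconds_coverter``. A minimal standalone sketch of that conversion for the plain hh:mm:ss form, using only the standard library (the dd:hh:mm:ss variant accepted by configman is not handled here)::

    import datetime

    def hhmmss_to_seconds(td_str):
        # mirrors the converter defined near the top of this module
        hours, minutes, seconds = [int(x) for x in td_str.split(':')]
        td = datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds)
        return td.seconds + td.days * 24 * 3600

    # hhmmss_to_seconds('00:02:00') -> 120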
+ #-------------------------------------------------------------------------- + required_config.namespace('job_manager') + required_config.job_manager.add_option( + 'database_class', + doc="the class of the job_manager's database", + default='socorro.external.postgresql.connection_context.' + 'ConnectionContext', + from_string_converter=class_converter + ) + required_config.job_manager.add_option( + 'transaction_executor_class', + default='socorro.database.transaction_executor.TransactionExecutor', + doc="a class that will manage the job_manager's transactions", + from_string_converter=class_converter + ) + required_config.job_manager.add_option( + 'standard_loop_frequency', + default='00:02:00', + doc="the frequency to check for new jobs (hh:mm:ss)", + from_string_converter=timedelta_to_seconds_coverter + ) + required_config.job_manager.add_option( + 'priority_loop_frequency', + default='00:00:30', + doc="the frequency to check for new priority jobs (hh:mm:ss)", + from_string_converter=timedelta_to_seconds_coverter + ) + required_config.job_manager.add_option( + 'job_cleanup_frequency', + default='00:05:00', + doc="the frequency to check for new jobs (hh:mm:ss)", + from_string_converter=timedelta_to_seconds_coverter + ) + + #-------------------------------------------------------------------------- + def __init__(self, config): + super(MonitorApp, self).__init__(config) + self.registrar_database = config.registrar.database_class( + config.registrar + ) + self.registrar_transaction = \ + config.registrar.transaction_executor_class( + config.registrar, + self.registrar_database, + quit_check_callback=self._quit_check + ) + + self.job_manager_database = config.job_manager.database_class( + config.job_manager + ) + self.job_manager_transaction = \ + config.job_manager.transaction_executor_class( + config.job_manager, + self.registrar_database, + quit_check_callback=self._quit_check + ) + + self.new_crash_source = config.new_crash_source.new_crash_source_class( + config.new_crash_source, + '' + ) + + signal.signal(signal.SIGTERM, self._respond_to_SIGTERM) + signal.signal(signal.SIGHUP, self._respond_to_SIGTERM) + + self.quit = False + + #-------------------------------------------------------------------------- + # utilities section + #-------------------------------------------------------------------------- + def _respond_to_SIGTERM(self, signal_number, frame): + """these classes are instrumented to respond to a KeyboardInterrupt by + cleanly shutting down. This function, when given as a handler to for a + SIGTERM event, will make the program respond to a SIGTERM as neatly as + it responds to ^C.""" + signame = 'SIGTERM' + if signal_number != signal.SIGTERM: + signame = 'SIGHUP' + self.config.logger.info("%s detected", signame) + raise KeyboardInterrupt + + #-------------------------------------------------------------------------- + def _quit_check(self): + """this a callback function to be propagated through out the system. + threads should periodically call this function so see if they should + shutdown""" + if self.quit: + self.config.logger.debug('quit signal acknowledged') + raise KeyboardInterrupt + + #-------------------------------------------------------------------------- + def _responsive_sleep(self, seconds): + """the threads spend most of their time sleeping. Even though they're + not doing work, they need to contuously poll the quit function so + that the monitor can shut down promptly on request. 
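The signal handlers registered in ``__init__`` funnel SIGTERM and SIGHUP into the same clean shutdown path as ^C. A self-contained sketch of the pattern (the handler name here is hypothetical)::

    import signal

    def _raise_keyboard_interrupt(signal_number, frame):
        # turn a service stop into the KeyboardInterrupt the threads
        # already know how to handle
        raise KeyboardInterrupt

    signal.signal(signal.SIGTERM, _raise_keyboard_interrupt)
    signal.signal(signal.SIGHUP, _raise_keyboard_interrupt)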
This function + sleeps, polling the quit function each second.""" + self.config.logger.info('sleeping for %s seconds', seconds) + for x in xrange(int(seconds)): + self._quit_check() + time.sleep(1.0) + + #-------------------------------------------------------------------------- + def _responsive_join(self, thread): + """similar to the responsive sleep, a join function blocks a thread + until some other thread dies. If it doesn't happen to be the main + thread waitng, it'll need to poll the quit function peroidically to + know if it should quit. + + parameters: + thread - an instance of the TaskThread class representing the + thread to wait for + """ + while True: + try: + thread.join(1.0) + if not thread.isAlive(): + break # no use waiting for a thread that isn't there + self._quit_check() + except KeyboardInterrupt: + self.config.logger.debug('quit detected by _responsive_join') + self.quit = True + + #-------------------------------------------------------------------------- + # job manager section + #-------------------------------------------------------------------------- + def _clean_jobs_table_transaction(self, connection): + """go through the jobs table and remove jobs that are complete""" + self.config.logger.debug("removing completed jobs from queue") + self.config.logger.debug("starting deletion") + execute_no_results( + connection, + "delete from jobs " + "where" + " uuid in (select " + " uuid" + " from" + " jobs j" + " where" + " j.success is not null)" + ) + + #-------------------------------------------------------------------------- + def _kick_stalled_jobs_transaction(self, connection): + """try to restart stalled jobs by changing the startteddatetime to + NULL. This should get the attention of the assigned processor""" + self.config.logger.debug("restart stalled jobs in queue") + execute_no_results( + connection, + "update jobs " + " set starteddatetime = NULL " + "where" + " success is NULL" + " and completeddatetime is NULL" + " and starteddatetime < now() - %s - %s", + (self.config.registrar.check_in_frequency, + self.config.registrar.processor_grace_period) + ) + + #-------------------------------------------------------------------------- + def _job_cleanup_thread(self): + """this is the main rountine for the job_cleanup_thread. Each cycle + does three transactions: the first deletes completed jobs; the second + kicks stalled jobs; the third removes dead processors""" + self.config.logger.info("job_cleanup_loop starting.") + try: + while True: + try: + self.config.logger.info("begin _job_cleanup_thread cycle") + self.job_manager_transaction( + self._clean_jobs_table_transaction + ) + self.job_manager_transaction( + self._kick_stalled_jobs_transaction + ) + self.registrar_transaction( + self._sweep_dead_processors_transaction + ) + self.config.logger.info( + "beginning _job_cleanup_thread cycle." 
+ ) + self.config.logger.info("end _job_cleanup_thread cycle") + self._responsive_sleep( + self.config.job_manager.job_cleanup_frequency + ) + except (KeyboardInterrupt, SystemExit): + #self.config.logger.debug("got quit message") + self.quit = True + break + except Exception: + self.config.logger.warning( + 'unexpected exception', + exc_info=True) + finally: + self.config.logger.info("job_cleanup_loop done.") + + #-------------------------------------------------------------------------- + # processor management section + #-------------------------------------------------------------------------- + def _sweep_dead_processors_transaction(self, connection): + """this function is a single database transaction: look for dead + processors - find all the jobs of dead processors and assign them to + live processors then delete the dead processor registrations""" + self.config.logger.info("looking for dead processors") + try: + self.config.logger.info( + "threshold %s", + self.config.registrar.check_in_frequency + ) + threshold = single_value_sql( + connection, + "select now() - %s - %s", + (self.config.registrar.processor_grace_period, + self.config.registrar.check_in_frequency) + ) + dead_processors = execute_query_fetchall( + connection, + "select id from processors where lastSeenDateTime < %s", + (threshold,) + ) + if dead_processors: + self.config.logger.info("found dead processor(s):") + for a_dead_processor in dead_processors: + self.config.logger.info("%d is dead", a_dead_processor[0]) + + self.config.logger.debug("getting list of live processor(s):") + live_processors = execute_query_fetchall( + connection, + "select id from processors where lastSeenDateTime >= %s", + (threshold,) + ) + if not live_processors: + if self.config.registrar.quit_if_no_processors: + raise NoProcessorsRegisteredError( + "There are no processors registered" + ) + else: + self.config.logger.critical( + 'There are no live processors, nothing to do. ' + 'Waiting for processors to come on line.' + ) + return + number_of_live_processors = len(live_processors) + + self.config.logger.debug( + "getting range of queued date for jobs associated with " + "dead processor(s):" + ) + dead_processor_ids_str = ", ".join( + [str(x[0]) for x in dead_processors] + ) + earliest_dead_job, latest_dead_job = single_row_sql( + connection, + "select min(queueddatetime), max(queueddatetime) from jobs " + "where owner in (%s)" % dead_processor_ids_str + ) + # take dead processor jobs and reallocate them to live + # processors in equal sized chunks + if (earliest_dead_job is not None and + latest_dead_job is not None): + time_increment = ( + (latest_dead_job - earliest_dead_job) / + number_of_live_processors + ) + for x, live_processor_id in enumerate(live_processors): + low_queued_time = ( + x * time_increment + earliest_dead_job + ) + high_queued_time = ( + (x + 1) * time_increment + earliest_dead_job + ) + self.config.logger.info( + "assigning jobs from %s to %s to processor %s:", + low_queued_time, + high_queued_time, + live_processor_id + ) + # why is the range >= at both ends? the range must be + # inclusive, the risk of moving a job twice is low and + # consequences low, too. 
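The reallocation above divides the dead processors' backlog into equal queued-time slices, one per live processor. The arithmetic in isolation, as a sketch with hypothetical names (not part of the patch)::

    def queued_time_slices(earliest, latest, live_processor_ids):
        # split [earliest, latest] into len(live_processor_ids) equal
        # windows; each (low, high, processor_id) triple drives one pair of
        # UPDATE statements like those above
        increment = (latest - earliest) / len(live_processor_ids)
        return [
            (earliest + i * increment, earliest + (i + 1) * increment, pid)
            for i, pid in enumerate(live_processor_ids)
        ]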
+ # 1st step: take any jobs of a dead processor that were + # in progress and reset them to unprocessed + execute_no_results( + connection, + "update jobs set" + " starteddatetime = NULL " + "where" + " %%s >= queueddatetime" + " and queueddatetime >= %%s" + " and owner in (%s)" + " and success is NULL" % dead_processor_ids_str, + (high_queued_time, low_queued_time) + ) + # 2nd step: take all jobs of a dead processor and give + # them to a new owner + execute_no_results( + connection, + "update jobs set" + " owner = %%s " + "where" + " %%s >= queueddatetime" + " and queueddatetime >= %%s" + " and owner in (%s)" % dead_processor_ids_str, + (live_processor_id, high_queued_time, + low_queued_time) + ) + + # transfer stalled priority jobs to new processors + for dead_processor_tuple in dead_processors: + self.config.logger.info( + "re-assigning priority jobs from processor %d:", + dead_processor_tuple[0] + ) + savepoint = "drop_priority_jobs_%d" % dead_processor_tuple[0] + execute_no_results(connection, "savepoint %s" % savepoint) + try: + execute_no_results( + connection, + "insert into priorityjobs (uuid) select uuid " + "from priority_jobs_%d" % dead_processor_tuple + ) + except Exception as e: + self.config.logger.warn("attempt to move priority jobs " + "for processor %d failed: %s", + dead_processor_tuple[0], e) + execute_no_results(connection, + "rollback to savepoint %s" % + savepoint) + else: + execute_no_results(connection, + "release savepoint %s" % + savepoint) + + self.config.logger.info("removing all dead processors") + execute_no_results( + connection, + "delete from processors where lastSeenDateTime < %s", + (threshold,) + ) + # remove dead processors' priority tables + for a_dead_processor in dead_processors: + execute_no_results( + connection, + "drop table if exists priority_jobs_%d" % + a_dead_processor[0] + ) + except NoProcessorsRegisteredError: + self.quit = True + self.config.logger.critical('there are no live processors') + + #-------------------------------------------------------------------------- + def _get_processors_and_loads_transaction(self, connection): + """this transaction fetches a list of live processors and how many + jobs each curretly has assigned to it""" + sql = ("with live_processors as " + " (select * from processors where " + " lastSeenDateTime > now() - %s)" + "select" + " p.id," + " count(j.owner)," + " p.name " + "from" + " live_processors p left join jobs j " + " on p.id = j.owner" + " and j.success is null " + "group by p.id, p.name") + processors_and_load = execute_query_fetchall( + connection, + sql, + (self.config.registrar.check_in_frequency,) + ) + # convert row tuples to muteable lists + return [[a_row[0], a_row[1], a_row[2]] + for a_row in processors_and_load] + + #-------------------------------------------------------------------------- + def _balanced_processor_iter(self): + """ This takes a snap shot of the state of the processors as well as + the number of jobs assigned to each then acts as an iterator that + returns a sequence of processor ids. Order of ids returned will assure + that jobs are assigned in a balanced manner. + + This iterator is infinite. It never raises StopIteration. How does + it ever quit? It is run in parallel with the iterator that fetches + a batch of crash_ids from the crash_id source by the + '_standard_job_thread' method. When that iterator is exhausted, this + iterator is thrown away. 
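To make the balancing step concrete, here is its core reduced to plain lists, a sketch with hypothetical names: always hand out the id of the least loaded processor and bump its count so the next pick stays balanced::

    def balanced_ids(processors_and_loads):
        # processors_and_loads: mutable [processor_id, load, name] rows,
        # e.g. [[1, 0, 'a'], [2, 2, 'b']]
        while True:
            processors_and_loads.sort(key=lambda row: row[1])
            processors_and_loads[0][1] += 1
            yield processors_and_loads[0][0]

    # first four picks for the example rows above: 1, 1, 1, 2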
On the next batch of crash_ids, a new copy of + this iterator is created.""" + self.config.logger.debug( + "balanced _balanced_processor_iter: compiling list of active " + "processors" + ) + try: + list_of_processors_and_loads = self.job_manager_transaction( + self._get_processors_and_loads_transaction + ) + self.config.logger.debug( + "list_of_processors_and_loads: %s", + str(list_of_processors_and_loads) + ) + if not list_of_processors_and_loads: + if self.config.registrar.quit_if_no_processors: + raise NoProcessorsRegisteredError( + "There are no processors registered" + ) + else: + self.config.logger.critical( + "There are no live processors. " + "Waiting for processors to come on line" + ) + yield None + while True: + #self.config.logger.debug( + #"sort the list of (processorId, numberOfAssignedJobs) pairs" + #) + list_of_processors_and_loads.sort(lambda x, y: cmp(x[1], y[1])) + # the processor with the fewest jobs is about to be assigned a + # new job, so increment its count + list_of_processors_and_loads[0][1] += 1 + #self.config.logger.debug( + #"yield the processorId which had the fewest jobs: %d", + #list_of_processors_and_loads[0][0] + #) + yield (list_of_processors_and_loads[0][0], + list_of_processors_and_loads[0][2]) + except NoProcessorsRegisteredError: + self.quit = True + self.config.logger.critical('there are no live processors') + raise + + #-------------------------------------------------------------------------- + def _get_live_processors_transaction(self, connection): + """this transaction just fetches a list of live processors""" + processor_ids = execute_query_fetchall( + connection, + "select id, name from processors " + "where lastSeenDateTime > now() - interval %s", + (self.config.registrar.check_in_frequency,) + ) + # remove the row tuples, just give out a pure list of ids + return [(a_row[0], a_row[1]) for a_row in processor_ids] + + #-------------------------------------------------------------------------- + def _unbalanced_processor_iter(self): + """ This generator returns a sequence of active processorId without + regard to job balance. Like its brother, '_balanced_processor_iter', + it is an infinite iter, never raising 'StopIteration'.""" + self.config.logger.debug( + "_unbalanced_processor_iter: compiling list of active processors" + ) + try: + while True: + list_of_processor_ids = self.job_manager_transaction( + self._get_live_processors_transaction + ) + if not list_of_processor_ids: + if self.config.registrar.quit_if_no_processors: + raise NoProcessorsRegisteredError( + "There are no processors registered" + ) + else: + self.config.logger.critical( + "There are no live processors. " + "Waiting for processors to come on line" + ) + yield None + for a_processor_id, a_processor_name in list_of_processor_ids: + #self.config.logger.debug( + #'about to yield %s(%d)', + #a_processor_name, + #a_processor_id + #) + yield a_processor_id, a_processor_name + except NoProcessorsRegisteredError: + self.quit = True + self.config.logger.critical('there are no live processors') + raise + + #-------------------------------------------------------------------------- + # job queuing section + #-------------------------------------------------------------------------- + def _queue_standard_job_transaction(self, connection, crash_id, + candidate_processor_iter): + """this method implements a single transaction, inserting a crash into + the 'jobs' table. 
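The unbalanced iterator simply walks the list of live processors over and over, refreshing the list on each pass; with a fixed list it degenerates to ``itertools.cycle``. A sketch with hypothetical names::

    def unbalanced_ids(get_live_processor_ids):
        # re-fetch the list on every pass so newly registered processors
        # are picked up; within a pass, just round-robin
        while True:
            for processor_id in get_live_processor_ids():
                yield processor_id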
Because the jobs table contains a non-NULL foreign + key reference to the 'processors' table, the act of insertion is also + the act of assigning the crash to a processor.""" + #self.config.logger.debug("trying to insert %s", crash_id) + processor_id, processor_name = candidate_processor_iter.next() + if processor_id is None: + return None + execute_no_results( + connection, + "insert into jobs (pathname, uuid, owner, priority," + " queuedDateTime) " + "values (%s, %s, %s, %s, %s)", + ('', crash_id, processor_id, 1, utc_now()) + ) + self.config.logger.info( + "%s assigned to processor %s (%d)", + crash_id, + processor_name, + processor_id + ) + return processor_id + + #-------------------------------------------------------------------------- + def _queue_priorty_job_transaction(self, connection, crash_id, + candidate_processor_iter): + """this method implements a transaction, inserting a crash to both + the 'jobs' table (via the '_queue_standard_job_transaction' method) + and the 'priority_jobs_XXX' table associated with the target + processor""" + assigned_processor = self._queue_standard_job_transaction( + connection, + crash_id, + candidate_processor_iter + ) + if assigned_processor is None: + return None + execute_no_results( + connection, + "insert into priority_jobs_%d (uuid) values (%%s)" + % assigned_processor, + (crash_id,) + ) + execute_no_results( + connection, + "delete from priorityjobs where uuid = %s", + (crash_id,) + ) + return assigned_processor + + #-------------------------------------------------------------------------- + def _standard_job_thread(self): + """This is the main method for the 'standard_job_thread'. It is + responsible for iterating through the 'new_crash_source' for new + crashes, and assigning them to processors. + """ + try: + self.config.logger.info("starting _standard_job_thread") + while (True): + self.config.logger.info("begin _standard_job_thread cycle") + self._quit_check() + # walk the dump indexes and assign jobs + self.config.logger.debug("getting _balanced_processor_iter") + processor_iter = self._balanced_processor_iter() + self.config.logger.debug("scanning for new crashes") + for crash_id in self.new_crash_source(): + try: + #self.config.logger.debug("new job: %s", crash_id) + while True: + # retry until we succeed in assigning + self._quit_check() + assigned_processor = \ + self.job_manager_transaction( + self._queue_standard_job_transaction, + crash_id, + processor_iter + ) + if assigned_processor is not None: + break + self.config.logger.warning( + 'sleeping for %s, and then trying again', + 60 + ) + self._responsive_sleep(60) + processor_iter = self._balanced_processor_iter() + # if the monitor starts misbehaving and not quitting after + # a SIGTERM or ^C, uncomment the following two line. It + # will help diagnose the problem. 
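The standard loop retries a crash until some processor accepts it, sleeping between attempts and rebuilding the balanced iterator after each failure. Stripped of the transaction machinery, the retry shape is roughly this (hypothetical names)::

    import time

    def assign_with_retry(try_assign, crash_id, retry_delay=60):
        # try_assign returns a processor id, or None when no processor is
        # available; keep trying until one accepts the crash
        while True:
            processor_id = try_assign(crash_id)
            if processor_id is not None:
                return processor_id
            time.sleep(retry_delay)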
+ #except KeyboardInterrupt: + #self.config.logger.debug("inner detects quit") + #self.quit = True + #raise + except Exception: + self.config.logger.error( + 'Unexpected exception while assigning jobs ' + 'to processors', + exc_info=True + ) + self.config.logger.info("end _standard_job_thread cycle") + self._responsive_sleep( + self.config.job_manager.standard_loop_frequency + ) + except Exception: + self.config.logger.critical( + 'something is seriously wrong', + exc_info=True + ) + self.quit = True + raise + except (KeyboardInterrupt, SystemExit): + #self.config.logger.debug("outer detects quit") + self.quit = True + finally: + self.config.logger.debug("_standard_job_thread done.") + + #-------------------------------------------------------------------------- + def _get_priority_jobs_transaction(self, connection): + """this method implements a single transaction that just returns a + set of priority jobs.""" + priority_jobs_list = execute_query_fetchall( + connection, + "select * from priorityjobs" + ) + return set(x[0] for x in priority_jobs_list) + + #-------------------------------------------------------------------------- + def _prioritize_previously_enqueued_jobs_transaction(self, connection, + crash_id): + """priorty jobs come into the system at random times. A given crash_id + may already be queued for processing when a priority request comes in + for it. To avoid repeating processing, a priority crash_id is checked + to see if it is already queued. If it is, the processor already + assigned to it is told to expedite processing. This done just by + entering the crash_id into the processors private 'priority_jobs_XXX' + table.""" + try: + job_owner = single_value_sql( + connection, + "select owner from jobs where uuid = %s", + (crash_id,) + ) + except SQLDidNotReturnSingleValue: + return False + priority_job_table_name = 'priority_jobs_%d' % job_owner + self.config.logger.debug( + "priority job %s was already in the queue, assigned to %d", + crash_id, + job_owner + ) + try: + # detect if the found job was assigned to a processor that was + # dead by checking to see if the priority jobs table exists or + # not. If id doesn't exist, wait for the job to get reassigned + # to a live processor. It in the future, it may be better to + # just reassign the job immediately. + single_value_sql( # return value intentionally ignored + connection, + "select 1 from pg_stat_user_tables where relname = %s", + (priority_job_table_name,) + ) + except SQLDidNotReturnSingleValue: + self.config.logger.debug( + "%s assigned to dead processor %d - " + "wait for reassignment", + crash_id, + job_owner + ) + # likely that the job is assigned to a dead processor + # skip processing it this time around - by next time + # hopefully it will have been + # re assigned to a live processor + return False + execute_no_results( + connection, + "insert into %s (uuid) values (%%s)" % + priority_job_table_name, + (crash_id,) + ) + execute_no_results( + connection, + "delete from priorityjobs where uuid = %s", + (crash_id,) + ) + return True + + #-------------------------------------------------------------------------- + def _prioritize_previously_enqueued_jobs(self, priority_jobs_set): + """this method checks to see if any priorty jobs are already queued + for processing. 
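The transaction above decides whether an already-queued crash can be expedited by probing for the owner's private ``priority_jobs_<id>`` table; that table only exists while the processor is registered, so the probe doubles as a liveness check. In isolation, the probe looks roughly like this, reusing the helpers imported at the top of the module (the function name is hypothetical)::

    from socorro.external.postgresql.dbapi2_util import (
        single_value_sql,
        SQLDidNotReturnSingleValue,
    )

    def priority_table_exists(connection, processor_id):
        try:
            single_value_sql(
                connection,
                "select 1 from pg_stat_user_tables where relname = %s",
                ('priority_jobs_%d' % processor_id,)
            )
            return True
        except SQLDidNotReturnSingleValue:
            return False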
If so, a transaction is executed that will expedite + processing.""" + # check for crash_ids already in the queue + for crash_id in list(priority_jobs_set): # must use list copy - + # the set gets changed + self._quit_check() + success = self.job_manager_transaction( + self._prioritize_previously_enqueued_jobs_transaction, + crash_id + ) + if success: + priority_jobs_set.remove(crash_id) + + #-------------------------------------------------------------------------- + def _prioritize_unqueued_jobs(self, priority_jobs_set): + """this method takes priority jobs that where not already queued + and queues them.""" + self.config.logger.debug("starting prioritize_unqueued_jobs") + processor_iter = None + for crash_id in list(priority_jobs_set): # must use list copy - + # the set gets changed + self.config.logger.debug("looking for %s", crash_id) + while True: + self.config.logger.info("priority queuing %s", crash_id) + if not processor_iter: + self.config.logger.debug( + "about to get unbalanced_processor_iter" + ) + processor_iter = self._unbalanced_processor_iter() + self.config.logger.debug( + "unbalancedJobScheduler successfully fetched" + ) + assigned_processor = self.job_manager_transaction( + self._queue_priorty_job_transaction, + crash_id, + processor_iter + ) + if assigned_processor is None: + self.config.logger.critical( + "can't seem to assign this job to a processor, are " + "processors running?" + ) + self._responsive_sleep(10) + continue + #self.config.logger.debug( + #"%s assigned to %d", + #crash_id, + #assigned_processor + #) + self.job_manager_transaction( + execute_no_results, + "delete from priorityjobs where uuid = %s", + (crash_id,) + ) + priority_jobs_set.remove(crash_id) + break + + ##------------------------------------------------------------------------- + #def remove_missing_priority_jobs(self, priority_jobs_set): + ## we've failed to find the crash_ids anywhere + #for crash_id in priority_jobs_set: + #self.quit_check() + #self.config.logger.warning( + #"priority job %s was never found", + #crash_id + #) + #self.job_manager_transaction( + #execute_no_results, + #"delete from priorityjobs where uuid = %s", + #(crash_id,) + #) + + #-------------------------------------------------------------------------- + def _priority_job_thread(self): + """this method is the main function for the 'priority_job_thread'. It + periodically polls the 'priorityjobs' table for priority crash_ids. + Each crash_id is first checked to see if it already enqueued. 
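Both priority methods iterate over ``list(priority_jobs_set)`` rather than the set itself so that handled ids can be removed from the set mid-loop. The idiom on its own (hypothetical names)::

    def drain_handled(priority_jobs_set, was_handled):
        # iterate over a copy; mutate the original as ids are dealt with
        for crash_id in list(priority_jobs_set):
            if was_handled(crash_id):
                priority_jobs_set.remove(crash_id)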
If not, + it queues them.""" + self.config.logger.info("start _priority_job_thread") + try: + while (True): + self.config.logger.info("begin _priority_job_thread cycle") + try: + self._quit_check() + priority_jobs_set = self.job_manager_transaction( + self._get_priority_jobs_transaction + ) + if priority_jobs_set: + self.config.logger.debug( + "beginning search for priority jobs" + ) + self._prioritize_previously_enqueued_jobs( + priority_jobs_set + ) + self._prioritize_unqueued_jobs(priority_jobs_set) + #self.remove_missing_priority_jobs(priority_jobs_set) + #except KeyboardInterrupt: + #self.config.logger.debug("inner detects quit") + #raise + except Exception: + self.config.logger.error( + "Unexpected exception", + exc_info=True + ) + self.config.logger.info("end _priority_job_thread cycle") + self._responsive_sleep( + self.config.job_manager.priority_loop_frequency + ) + except (KeyboardInterrupt, SystemExit): + #self.config.logger.debug("outer detects quit") + self.quit = True + except Exception: + self.config.logger.critical( + "something's gone horribly wrong", + exc_info=True + ) + self.quit = True + finally: + self.config.logger.info("priorityLoop done.") + + #-------------------------------------------------------------------------- + def main(self): + """this function is run by the main thread. It just starts the + subordinate threads and then waits for them to complete.""" + standard_job_thread = threading.Thread( + name="standard_job_thread", + target=self._standard_job_thread + ) + standard_job_thread.start() + + priority_job_thread = threading.Thread( + name="priority_job_thread", + target=self._priority_job_thread + ) + priority_job_thread.start() + + job_cleanup_thread = threading.Thread( + name="job_cleanup_thread", + target=self._job_cleanup_thread + ) + job_cleanup_thread.start() + + self.config.logger.debug("waiting to join.") + self._responsive_join(job_cleanup_thread) + self._responsive_join(priority_job_thread) + self._responsive_join(standard_job_thread) + + +if __name__ == '__main__': + main(MonitorApp) diff --git a/socorro/storage/__init__.py b/socorro/storage/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/socorro/storage/crashstorage.py b/socorro/storage/crashstorage.py new file mode 100644 index 0000000000..0e12527d4e --- /dev/null +++ b/socorro/storage/crashstorage.py @@ -0,0 +1,332 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +# Temporary adaptors from old code to new code + +import re +import random +import threading + +import socorro.lib.ver_tools as vtl + +from socorro.external.crashstorage_base import ( + FallbackCrashStorage, + CrashStorageBase, +) +from socorro.external.filesystem.crashstorage import FileSystemRawCrashStorage +from socorro.external.hbase.crashstorage import HBaseCrashStorage +from socorro.external.hbase.connection_context import \ + HBaseConnectionContextPooled +from socorro.database.transaction_executor import \ + TransactionExecutorWithLimitedBackoff + +from configman.dotdict import DotDict + +compiledRegularExpressionType = type(re.compile('')) +functionType = type(lambda x: x) + +pattern_str = r'(\d+)\.(\d+)\.?(\d+)?\.?(\d+)?([a|b]?)(\d*)(pre)?(\d)?' 
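As a quick illustration of what the version pattern defined just above (and compiled on the next line) captures, a standalone check that duplicates the pattern so it can run on its own::

    import re

    pattern = re.compile(
        r'(\d+)\.(\d+)\.?(\d+)?\.?(\d+)?([a|b]?)(\d*)(pre)?(\d)?'
    )
    # major, minor, patch, fourth part, beta letter, beta number, 'pre', digit
    assert pattern.match('10.0.2b3').groups() == \
        ('10', '0', '2', None, 'b', '3', None, None)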
+pattern = re.compile(pattern_str) + +pattern_plus = re.compile(r'((\d+)\+)') + + +#============================================================================== +class CrashStorageSystem(object): + #-------------------------------------------------------------------------- + def __init__ (self, config): + self.config = config + + #-------------------------------------------------------------------------- + NO_ACTION = 0 + OK = 1 + DISCARDED = 2 + ERROR = 3 + RETRY = 4 + + #-------------------------------------------------------------------------- + def close (self): + pass + + #-------------------------------------------------------------------------- + def save_raw (self, uuid, jsonData, dumps): + try: + self.crash_storage.save_raw_crash(jsonData, dumps, uuid) + return CrashStorageSystem.OK + except Exception: + return CrashStorageSystem.ERROR + + #-------------------------------------------------------------------------- + def save_processed (self, uuid, jsonData): + try: + if 'uuid' not in jsonData: + jsonData['uuid'] = uuid + self.crash_storage.save_processed(jsonData) + return CrashStorageSystem.OK + except Exception: + return CrashStorageSystem.ERROR + + #-------------------------------------------------------------------------- + def get_meta (self, uuid): + return self.crash_storage.get_raw_crash(uuid) + + #-------------------------------------------------------------------------- + def get_raw_dump (self, uuid, name=None): + return self.crash_storage.get_raw_dump(uuid, name) + + #-------------------------------------------------------------------------- + def get_raw_dumps (self, uuid): + return self.crash_storage.get_raw_dumps(uuid) + + #-------------------------------------------------------------------------- + def get_processed (self, uuid): + return self.crash_storage.get_processed(uuid) + + #-------------------------------------------------------------------------- + def remove (self, uuid): + try: + self.crash_storage.remove(uuid) + return CrashStorageSystem.OK + except Exception: + return CrashStorageSystem.ERROR + + #-------------------------------------------------------------------------- + def quickDelete (self, uuid): + return self.remove(uuid) + + #-------------------------------------------------------------------------- + def uuidInStorage (self, uuid): + try: + self.crash_storage.get_raw_crash(uuid) + return True + except Exception: + return False + + #-------------------------------------------------------------------------- + def newUuids(self): + for a_crash in self.crash_storage.new_crashes(): + yield a_crash + + +#============================================================================== +class CrashStorageSystemForLocalFS(CrashStorageSystem): + def __init__(self, config, quit_check=None): + super(CrashStorageSystemForLocalFS, self).__init__(config) + + # new_config is an adapter to allow the modern configman enabled + # file system crash storage classes to use the old style configuration. 
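The adaptor classes below build their nested configuration with configman's ``DotDict``, an ordinary mapping that also allows attribute-style access, so nested keys such as ``primary.std_fs_root`` can be assembled one attribute at a time. A tiny standalone illustration (the path is a placeholder)::

    from configman.dotdict import DotDict

    d = DotDict()
    d.primary = DotDict()
    d.primary.std_fs_root = '/tmp/crashes'   # placeholder value
    assert d.primary.std_fs_root == d['primary']['std_fs_root']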
+ new_config = DotDict() + new_config.logger = config.logger + + new_config.primary = DotDict() + new_config.primary.storage_class = FileSystemRawCrashStorage + new_config.primary.std_fs_root = config.localFS + new_config.primary.dump_dir_count = config.localFSDumpDirCount + new_config.primary.dump_gid = config.localFSDumpGID + new_config.primary.dump_permissions = config.localFSDumpPermissions + new_config.primary.dir_permissions = config.localFSDirPermissions + new_config.primary.json_file_suffix = config.jsonFileSuffix + new_config.primary.dump_file_suffix = config.dumpFileSuffix + new_config.primary.logger = config.logger + + new_config.fallback = DotDict() + new_config.fallback.storage_class = FileSystemRawCrashStorage + new_config.fallback.std_fs_root = config.fallbackFS + new_config.fallback.dump_dir_count = config.fallbackDumpDirCount + new_config.fallback.dump_gid = config.fallbackDumpGID + new_config.fallback.dump_permissions = config.fallbackDumpPermissions + new_config.fallback.dir_permissions = config.fallbackDirPermissions + new_config.fallback.json_file_suffix = config.jsonFileSuffix + new_config.fallback.dump_file_suffix = config.dumpFileSuffix + new_config.fallback.logger = config.logger + + self.crash_storage = FallbackCrashStorage(new_config, quit_check) + + +#============================================================================== +class CrashStorageSystemForHBase(CrashStorageSystem): + def __init__(self, config, quit_check=None): + super(CrashStorageSystemForHBase, self).__init__(config) + # new_config is an adapter to allow the modern configman enabled + # file system crash storage classes to use the old style configuration. + new_config = DotDict() + new_config.logger = config.logger + new_config.hbase_connection_pool_class = HBaseConnectionContextPooled + new_config.number_of_retries = 2 + new_config.hbase_host = config.hbaseHost + new_config.hbase_port = config.hbasePort + new_config.hbase_timeout = config.hbaseTimeout + new_config.forbidden_keys = ['email', 'url', 'user_id', + 'exploitability'] + new_config.transaction_executor_class = \ + TransactionExecutorWithLimitedBackoff + new_config.backoff_delays = [10, 30, 60, 120, 300] + new_config.wait_log_interval = 5 + + self.crash_storage = HBaseCrashStorage(new_config, quit_check) + + +#============================================================================== +class LegacyThrottler(object): + #-------------------------------------------------------------------------- + def __init__(self, config): + self.config = config + self.processedThrottleConditions = \ + self.preprocessThrottleConditions(config.throttleConditions) + #-------------------------------------------------------------------------- + ACCEPT = 0 + DEFER = 1 + DISCARD = 2 + IGNORE = 3 + + #-------------------------------------------------------------------------- + @staticmethod + def regexpHandlerFactory(regexp): + def egexpHandler(x): + return regexp.search(x) + return egexpHandler + + #-------------------------------------------------------------------------- + @staticmethod + def boolHandlerFactory(aBool): + def boolHandler(dummy): + return aBool + return boolHandler + + #-------------------------------------------------------------------------- + @staticmethod + def genericHandlerFactory(anObject): + def genericHandler(x): + return anObject == x + return genericHandler + + #-------------------------------------------------------------------------- + def preprocessThrottleConditions(self, originalThrottleConditions): + 
newThrottleConditions = [] + for key, condition, percentage in originalThrottleConditions: + #print "preprocessing %s %s %d" % (key, condition, percentage) + conditionType = type(condition) + if conditionType == compiledRegularExpressionType: + #print "reg exp" + newCondition = LegacyThrottler.regexpHandlerFactory(condition) + #print newCondition + elif conditionType == bool: + #print "bool" + newCondition = LegacyThrottler.boolHandlerFactory(condition) + #print newCondition + elif conditionType == functionType: + newCondition = condition + else: + newCondition = LegacyThrottler.genericHandlerFactory(condition) + newThrottleConditions.append((key, newCondition, percentage)) + return newThrottleConditions + + #-------------------------------------------------------------------------- + def understandsRefusal(self, raw_crash): + try: + return (vtl.normalize(raw_crash['Version']) >= vtl.normalize( + self.config.minimalVersionForUnderstandingRefusal[ + raw_crash['ProductName'] + ]) + ) + except KeyError: + return False + + #-------------------------------------------------------------------------- + def applyThrottleConditions(self, raw_crash): + """cycle through the throttle conditions until one matches or we fall + off the end of the list. + returns: + True - reject + False - accept + None - totally ignore this crash + """ + #print processedThrottleConditions + for key, condition, percentage in self.processedThrottleConditions: + throttleMatch = False + try: + if key == '*': + throttleMatch = condition(raw_crash) + else: + throttleMatch = condition(raw_crash[key]) + except KeyError: + if key == None: + throttleMatch = condition(None) + else: + #this key is not present in the jsonData - skip + continue + except IndexError: + pass + if throttleMatch: # condition match, apply the throttle percentage + if percentage is None: + return None + randomRealPercent = random.random() * 100.0 + return randomRealPercent > percentage + # nothing matched, reject + return True + + #-------------------------------------------------------------------------- + def throttle(self, raw_crash): + result = self.applyThrottleConditions(raw_crash) + if result is None: + self.config.logger.debug("ignoring %s %s", raw_crash.ProductName, + raw_crash.Version) + return LegacyThrottler.IGNORE + if result: + #self.config.logger.debug('yes, throttle this one') + if (self.understandsRefusal(raw_crash) and + not self.config.neverDiscard): + self.config.logger.debug("discarding %s %s", + raw_crash.ProductName, + raw_crash.Version) + return LegacyThrottler.DISCARD + else: + self.config.logger.debug("deferring %s %s", + raw_crash.ProductName, + raw_crash.Version) + return LegacyThrottler.DEFER + else: + self.config.logger.debug("not throttled %s %s", + raw_crash.ProductName, + raw_crash.Version) + return LegacyThrottler.ACCEPT + + +#============================================================================== +class CrashStoragePool(dict): + #-------------------------------------------------------------------------- + def __init__(self, config, storageClass=CrashStorageSystemForHBase, + quit_check=None): + super(CrashStoragePool, self).__init__() + self.config = config + self.logger = config.logger + self.storageClass = storageClass + self.quit_check_fn = quit_check + self.logger.debug("creating crashStorePool") + + #-------------------------------------------------------------------------- + def crashStorage(self, name=None): + """Like connecionCursorPairNoTest, but test that the specified + connection actually works""" + if 
name is None: + name = threading.currentThread().getName() + if name not in self: + self.logger.debug("creating crashStore for %s", name) + self[name] = self.storageClass(self.config, self.quit_check_fn) + return self[name] + + #-------------------------------------------------------------------------- + def cleanup(self): + for name, crashStore in self.iteritems(): + try: + crashStore.close() + self.logger.debug("crashStore for %s closed", name) + except Exception: + self.logger.warning('could not close %s', name, + exc_info=True) + + #-------------------------------------------------------------------------- + def remove(self, name): + self[name].close() + del self[name] diff --git a/socorro/unittest/config/commonconfig.py.dist b/socorro/unittest/config/commonconfig.py.dist new file mode 100644 index 0000000000..30870fe30d --- /dev/null +++ b/socorro/unittest/config/commonconfig.py.dist @@ -0,0 +1,172 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +## +## Copy this file to commonconfig.py +## - then fix the values to match your test environment +## + +#--------------------------------------------------------------------------- +# Relational Database Section + +import socorro.lib.ConfigurationManager as cm +databaseHost = cm.Option() +databaseHost.doc = 'the hostname of the database servers' +databaseHost.default = 'localhost' + +databaseName = cm.Option() +databaseName.doc = 'the name of the database within the server' +databaseName.default = 'socorro_integration_test' + +oldDatabaseName = cm.Option() +oldDatabaseName.doc = 'the name of the old, deprecated test database within the server' +oldDatabaseName.default = 'socorro_test' + +databaseUserName = cm.Option() +databaseUserName.doc = 'the user name for the database servers' +databaseUserName.default = 'test' + +databasePassword = cm.Option() +databasePassword.doc = 'the password for the database user' +databasePassword.default = 'aPassword' + +databaseSuperUserName = cm.Option() +databaseSuperUserName.doc = 'the super user name for the database servers' +databaseSuperUserName.default = 'test' + +databaseSuperUserPassword = cm.Option() +databaseSuperUserPassword.doc = 'the super user password for the database user' +databaseSuperUserPassword.default = 'aPassword' + +#--------------------------------------------------------------------------- +# RabbitMQ config + +rabbitMQUsername = cm.Option() +rabbitMQUsername.doc = 'the username of the rabbitmq user' +rabbitMQUsername.default = 'guest' + +rabbitMQPassword = cm.Option() +rabbitMQPassword.doc = 'the password of the rabbitmq user' +rabbitMQPassword.default = 'guest' + +rabbitMQHost = cm.Option() +rabbitMQHost.doc = 'the hostname of the rabbitmq service' +rabbitMQHost.default = 'localhost' + +rabbitMQPort = cm.Option() +rabbitMQPort.doc = 'the port of the rabbitmq service' +rabbitMQPort.default = 5672 + +rabbitMQVirtualhost = cm.Option() +rabbitMQVirtualhost.doc = 'the virtual host for rabbitmq' +rabbitMQVirtualhost.default = '/' + + +rabbitMQStandardQueue = cm.Option() +rabbitMQStandardQueue.doc = 'the standard queue for rabbitmq' +rabbitMQStandardQueue.default = 'socorro.normal' + +rabbitMQPriorityQueue = cm.Option() +rabbitMQPriorityQueue.doc = 'the standard queue for rabbitmq' +rabbitMQPriorityQueue.default = 'socorro.priority' + +#--------------------------------------------------------------------------- +# HBase storage 
system + +hbaseHost = cm.Option() +hbaseHost.doc = 'Hostname for hbase hadoop cluster. May be a VIP or load balancer' +hbaseHost.default = 'localhost' + +hbasePort = cm.Option() +hbasePort.doc = 'hbase port number' +hbasePort.default = 9090 + +hbaseTimeout = cm.Option() +hbaseTimeout.doc = 'timeout in milliseconds for an HBase connection' +hbaseTimeout.default = 5000 + +#--------------------------------------------------------------------------- +# elasticsearch storage system + +searchMaxNumberOfDistinctSignatures = cm.Option() +searchMaxNumberOfDistinctSignatures.doc = ( + "Integer containing the maximum allowed number of distinct signatures " + "the system should retrieve. " + "Used mainly for performances in ElasticSearch") +searchMaxNumberOfDistinctSignatures.default = 1000 + +elasticSearchHostname = cm.Option() +elasticSearchHostname.doc = 'String containing the URI of the Elastic Search instance.' +elasticSearchHostname.default = 'localhost' + +elasticSearchPort = cm.Option() +elasticSearchPort.doc = 'String containing the port on which calling the Elastic Search instance.' +elasticSearchPort.default = '9200' + +elasticsearch_urls = cm.Option() +elasticsearch_urls.doc = 'the urls to the elasticsearch instances' +elasticsearch_urls.default = 'http://localhost:9200' + +elasticsearch_index = cm.Option() +elasticsearch_index.doc = ( + 'an index format to pull crashes from elasticsearch ' + "(use datetime's strftime format to have daily, weekly or monthly indexes)" +) +elasticsearch_index.default = 'socorro_integration_test' + +elasticsearch_doctype = cm.Option() +elasticsearch_doctype.doc = 'the default doctype to use in elasticsearch' +elasticsearch_doctype.default = 'crash_reports' + +elasticsearch_timeout = cm.Option() +elasticsearch_timeout.doc = 'the time in seconds before a query to elasticsearch fails' +elasticsearch_timeout.default = 30 + +facets_max_number = cm.Option() +facets_max_number.doc = 'the maximum number of results a facet will return in search' +facets_max_number.default = 50 + +#--------------------------------------------------------------------------- +# statsd config + +statsdHost = cm.Option() +statsdHost.doc = '' +statsdHost.default = '' + +statsdPort = cm.Option() +statsdPort.doc = '' +statsdPort.default = 8125 + +statsdPrefix = cm.Option() +statsdPrefix.doc = '' +statsdPrefix.default = '' + +#--------------------------------------------------------------------------- +# Platforms +platforms = cm.Option() +platforms.doc = 'Array associating OS ids to full names.' +platforms.default = ( + { + "id" : "windows", + "name" : "Windows NT" + }, + { + "id" : "mac", + "name" : "Mac OS X" + }, + { + "id" : "linux", + "name" : "Linux" + }, +) + +#--------------------------------------------------------------------------- +# Release Channels +non_release_channels = cm.Option() +non_release_channels.doc = 'List of channels, excluding the `release` one.' +non_release_channels.default = ['beta', 'aurora', 'nightly'] + +restricted_channels = cm.Option() +restricted_channels.doc = 'List of channels to restrict based on build ids.' +restricted_channels.default = ['beta'] diff --git a/socorro/unittest/cron/cronTestconfig.py b/socorro/unittest/cron/cronTestconfig.py new file mode 100644 index 0000000000..1b35b4ed96 --- /dev/null +++ b/socorro/unittest/cron/cronTestconfig.py @@ -0,0 +1,25 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. 
If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +## Test config file for testMtbf +import socorro.lib.ConfigurationManager as cm +import datetime + +from socorro.unittest.config.commonconfig import databaseHost +try: + from socorro.unittest.config.commonconfig import databasePort +except: + databasePort = 5432 +from socorro.unittest.config.commonconfig import oldDatabaseName as databaseName +from socorro.unittest.config.commonconfig import databaseUserName +from socorro.unittest.config.commonconfig import databasePassword + +# processingDay = cm.Option() +# processingDay.doc = 'Day to process in (YYYY-MM-DD) format' +# processingDay.default = (datetime.date.today() - datetime.timedelta(1)).isoformat() # yesterday +# processingDay.singleCharacter = 'd' + +logFilePathname = cm.Option() +logFilePathname.doc = 'full pathname for the log file' +logFilePathname.default = '%(testDir)s/logs/cron_test.log' diff --git a/socorro/unittest/cron/jobs/test_daily_url.py b/socorro/unittest/cron/jobs/test_daily_url.py new file mode 100644 index 0000000000..ac5e02575b --- /dev/null +++ b/socorro/unittest/cron/jobs/test_daily_url.py @@ -0,0 +1,328 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +import datetime +import gzip +import os +from subprocess import PIPE +import mock +from nose.tools import eq_, ok_ +from crontabber.app import CronTabber +from socorro.unittest.cron.setup_configman import ( + get_config_manager_for_crontabber, +) +from socorro.unittest.cron.jobs.base import IntegrationTestBase + + +#============================================================================== +class IntegrationTestDailyURL(IntegrationTestBase): + + def setUp(self): + super(IntegrationTestDailyURL, self).setUp() + self.Popen_patcher = mock.patch('subprocess.Popen') + self.Popen = self.Popen_patcher.start() + + def tearDown(self): + self.conn.cursor().execute(""" + TRUNCATE TABLE reports CASCADE; + TRUNCATE TABLE bugs CASCADE; + TRUNCATE TABLE bug_associations CASCADE; + """) + self.conn.commit() + self.Popen_patcher.stop() + super(IntegrationTestDailyURL, self).tearDown() + + def _setup_config_manager(self, product='WaterWolf', + output_path=None, + public_output_path=None, + **kwargs + #version=None, + #private_user='ted', + #private_server='secure.mozilla.org', + #private_location='/var/logs/', + #public_user='bill', + #public_server='ftp.mozilla.org', + #public_location='/tmp/%Y%m%d/', + ): + if output_path is None: + output_path = self.tempdir + if public_output_path is None: + public_output_path = self.tempdir + extra_value_source = { + 'crontabber.class-DailyURLCronApp.output_path': output_path, + 'crontabber.class-DailyURLCronApp.public_output_path': public_output_path, + 'crontabber.class-DailyURLCronApp.product': product, + } + for key, value in kwargs.items(): + extra_value_source['crontabber.class-DailyURLCronApp.%s' % key] = value + + return get_config_manager_for_crontabber( + jobs='socorro.cron.jobs.daily_url.DailyURLCronApp|1d', + overrides=extra_value_source + ) + + def test_basic_run_job_no_data(self): + config_manager = self._setup_config_manager() + + with config_manager.context() as config: + tab = CronTabber(config) + tab.run_all() + + information = self._load_structure() + assert information['daily-url'] + assert not information['daily-url']['last_error'] + assert 
information['daily-url']['last_success'] + + # this should have created two .csv.gz files + now = datetime.datetime.utcnow() - datetime.timedelta(days=1) + + private = now.strftime('%Y%m%d-crashdata.csv.gz') + public = now.strftime('%Y%m%d-pub-crashdata.csv.gz') + ok_(private in os.listdir(self.tempdir)) + ok_(public in os.listdir(self.tempdir)) + + private_path = os.path.join(self.tempdir, private) + f = gzip.open(private_path) + try: + eq_(f.read(), '') + finally: + f.close() + + public_path = os.path.join(self.tempdir, public) + f = gzip.open(public_path) + try: + eq_(f.read(), '') + finally: + f.close() + + def test_run_job_no_data_but_scped(self): + config_manager = self._setup_config_manager( + public_output_path='', + private_user='peter', + private_server='secure.mozilla.org', + private_location='/var/data/', + private_ssh_command='chmod 0640 /var/data/*', + ) + + def comm(): + # no errors + return '', '' + + self.Popen().communicate.side_effect = comm + + with config_manager.context() as config: + tab = CronTabber(config) + tab.run_all() + + information = self._load_structure() + assert information['daily-url'] + assert not information['daily-url']['last_error'] + assert information['daily-url']['last_success'] + + # even though the files created are empty they should nevertheless + # be scp'ed + # can expect the command exactly + now = datetime.datetime.utcnow() - datetime.timedelta(days=1) + private = now.strftime('%Y%m%d-crashdata.csv.gz') + private_path = os.path.join(self.tempdir, private) + assert os.path.isfile(private_path) + scp_command = 'scp "%s" "peter@secure.mozilla.org:/var/data/"' % private_path + ssh_command = 'ssh "peter@secure.mozilla.org" "chmod 0640 /var/data/*"' + self.Popen.assert_any_call( + scp_command, + stdin=PIPE, stderr=PIPE, stdout=PIPE, + shell=True + ) + + self.Popen.assert_any_call( + ssh_command, + stdin=PIPE, stderr=PIPE, stdout=PIPE, + shell=True + ) + + def _insert_waterwolf_mock_data(self): + # these csv-like chunks of data are from the dataload tool + reports = """ +1,2012-06-15 10:34:45-07,2012-06-15 23:35:06.262196,0ac2e16a-a718-43c0-a1a5-6bf922111017,WaterWolf,1.0,20120615000001,FakeSignature1,http://porn.xxx,391578,,25,x86,GenuineIntel family 6 model 23 stepping 10 | 2,EXCEPTION_ACCESS_VIOLATION_READ,0x66a0665,Windows NT,5.1.2600 Service Pack 3,,"",2012-06-15 00:35:16.368154,2012-06-15 00:35:18.463317,t,f,"",,,,,"",t,9.0.124.0,,,release,{waterwolf@example.org} +2,2012-06-15 10:34:45-07,2012-06-15 23:35:06.262196,0bc2e16a-a718-43c0-a1a5-6bf922111017,WaterWolf,2.0,20120615000002,FakeSignature2,,391578,,25,x86,GenuineIntel family 6 model 23 stepping 10 | 2,EXCEPTION_ACCESS_VIOLATION_READ,0x66a0665,Windows NT,5.1.2600 Service Pack 3,,"",2012-06-15 00:35:16.368154,2012-06-15 00:35:18.463317,t,f,"",,,,,"",t,9.0.124.0,,,beta,{waterwolf@example.org} +3,2012-06-15 10:34:45-07,2012-06-15 23:35:06.262196,0cc2e16a-a718-43c0-a1a5-6bf922111017,WaterWolf,3.0a2,20120615000003,FakeSignature3,,391578,,25,x86,GenuineIntel family 6 model 23 stepping 10 | 2,EXCEPTION_ACCESS_VIOLATION_READ,0x66a0665,Windows NT,5.1.2600 Service Pack 3,,"",2012-06-15 00:35:16.368154,2012-06-15 00:35:18.463317,t,f,"",,,,,"",t,9.0.124.0,,,aurora,{waterwolf@example.org} +4,2012-06-15 10:34:45-07,2012-06-15 23:35:06.262196,0dc2e16a-a718-43c0-a1a5-6bf922111017,WaterWolf,4.0a1,20120615000004,FakeSignature4,,391578,,25,x86,GenuineIntel family 6 model 23 stepping 10 | 2,EXCEPTION_ACCESS_VIOLATION_READ,0x66a0665,Windows NT,5.1.2600 Service Pack 3,,"",2012-06-15 00:35:16.368154,2012-06-15 
00:35:18.463317,t,f,"",,,,,"",t,9.0.124.0,,,nightly,{waterwolf@example.org} +5,2012-06-16 10:34:45-07,2012-06-16 23:35:06.262196,1ac2e16a-a718-43c0-a1a5-6bf922111017,WaterWolf,1.0,20120615000001,FakeSignature1,http://porn.xxx,391578,,25,x86,GenuineIntel family 6 model 23 stepping 10 | 2,EXCEPTION_ACCESS_VIOLATION_READ,0x66a0665,Windows NT,5.1.2600 Service Pack 3,,"",2012-06-16 00:35:16.368154,2012-06-16 00:35:18.463317,t,f,"",,,,,"",t,9.0.124.0,,,release,{waterwolf@example.org} +6,2012-06-16 10:34:45-07,2012-06-16 23:35:06.262196,1bc2e16a-a718-43c0-a1a5-6bf922111017,WaterWolf,2.0,20120615000002,FakeSignature2,,391578,,25,x86,GenuineIntel family 6 model 23 stepping 10 | 2,EXCEPTION_ACCESS_VIOLATION_READ,0x66a0665,Windows NT,5.1.2600 Service Pack 3,,"",2012-06-16 00:35:16.368154,2012-06-16 00:35:18.463317,t,f,"",,,,,"",t,9.0.124.0,,,beta,{waterwolf@example.org} +7,2012-06-16 10:34:45-07,2012-06-16 23:35:06.262196,1cc2e16a-a718-43c0-a1a5-6bf922111017,WaterWolf,3.0a2,20120615000003,FakeSignature3,,391578,,25,x86,GenuineIntel family 6 model 23 stepping 10 | 2,EXCEPTION_ACCESS_VIOLATION_READ,0x66a0665,Windows NT,5.1.2600 Service Pack 3,,"",2012-06-16 00:35:16.368154,2012-06-16 00:35:18.463317,t,f,"",,,,,"",t,9.0.124.0,,,aurora,{waterwolf@example.org} +8,2012-06-16 10:34:45-07,2012-06-16 23:35:06.262196,1dc2e16a-a718-43c0-a1a5-6bf922111017,WaterWolf,4.0a1,20120615000004,FakeSignature4,,391578,,25,x86,GenuineIntel family 6 model 23 stepping 10 | 2,EXCEPTION_ACCESS_VIOLATION_READ,0x66a0665,Windows NT,5.1.2600 Service Pack 3,,"",2012-06-16 00:35:16.368154,2012-06-16 00:35:18.463317,t,f,"",,,,,"",t,9.0.124.0,,,nightly,{waterwolf@example.org} + """ + reports = reports.replace( + '2012-06-16', + (datetime.datetime.utcnow() + - datetime.timedelta(days=1)).strftime('%Y-%m-%d') + ) + + lines = [] + for line in reports.strip().splitlines(): + lines.append( + 'insert into reports values (' + + ','.join(not x and '0' or x.isdigit() and str(x) or "'%s'" % x + for x in line.strip().split(',')) + + ');' + ) + + mock_sql = '\n'.join(lines + ['']) + self.conn.cursor().execute(mock_sql) + self.conn.commit() + + def test_run_job_with_mocked_data(self): + config_manager = self._setup_config_manager() + self._insert_waterwolf_mock_data() + + with config_manager.context() as config: + tab = CronTabber(config) + tab.run_all() + + information = self._load_structure() + assert information['daily-url'] + assert not information['daily-url']['last_error'] + assert information['daily-url']['last_success'] + + # this should have created two .csv.gz files + now = datetime.datetime.utcnow() - datetime.timedelta(days=1) + + private = now.strftime('%Y%m%d-crashdata.csv.gz') + public = now.strftime('%Y%m%d-pub-crashdata.csv.gz') + ok_(private in os.listdir(self.tempdir)) + ok_(public in os.listdir(self.tempdir)) + + private_path = os.path.join(self.tempdir, private) + f = gzip.open(private_path) + try: + content = f.read() + ok_(content) + lines = content.splitlines() + header = lines[0] + payload = lines[1:] + eq_(header.split('\t')[0], 'signature') + eq_(header.split('\t')[1], 'url') + urls = [x.split('\t')[1] for x in payload] + ok_('http://porn.xxx' in urls) + signatures = [x.split('\t')[0] for x in payload] + eq_(sorted(signatures), + ['FakeSignature1', + 'FakeSignature2', + 'FakeSignature3', + 'FakeSignature4']) + finally: + f.close() + + public_path = os.path.join(self.tempdir, public) + f = gzip.open(public_path) + try: + content = f.read() + ok_(content) + lines = content.splitlines() + header = lines[0] + payload = 
lines[1:] + eq_(header.split('\t')[0], 'signature') + eq_(header.split('\t')[1], 'URL (removed)') + urls = [x.split('\t')[1] for x in payload] + ok_('http://porn.xxx' not in urls) + signatures = [x.split('\t')[0] for x in payload] + eq_(sorted(signatures), + ['FakeSignature1', + 'FakeSignature2', + 'FakeSignature3', + 'FakeSignature4']) + finally: + f.close() + + def test_run_job_with_mocked_data_with_scp_errors(self): + config_manager = self._setup_config_manager( + public_output_path='', + private_user='peter', + private_server='secure.mozilla.org', + private_location='/var/data/', + ) + self._insert_waterwolf_mock_data() + + def comm(): + # some errors + return '', "CRAP!" + + self.Popen().communicate.side_effect = comm + + with config_manager.context() as config: + tab = CronTabber(config) + tab.run_all() + + information = self._load_structure() + assert information['daily-url'] + assert not information['daily-url']['last_error'] + assert information['daily-url']['last_success'] + ok_(config.logger.warn.called) + + def test_run_job_with_no_data_with_ssh_errors(self): + config_manager = self._setup_config_manager( + public_output_path='', + private_user='peter', + private_server='secure.mozilla.org', + private_location='/var/data/', + private_ssh_command='chmod 0640 /var/data/*', + ) + self._insert_waterwolf_mock_data() + + # any mutable so we can keep track of the number of times + # the side_effect function is called + calls = [] + + def comm(): + if calls: + # some errors + return '', "CRAP!" + else: + calls.append(1) + return '', '' + + self.Popen().communicate.side_effect = comm + + with config_manager.context() as config: + tab = CronTabber(config) + tab.run_all() + + information = self._load_structure() + assert information['daily-url'] + assert not information['daily-url']['last_error'] + assert information['daily-url']['last_success'] + + ok_(config.logger.warn.called) + + def test_run_job_with_mocked_data_with_wrong_products(self): + config_manager = self._setup_config_manager( + product='Thunderbird,SeaMonkey', + version='1.0,2.0', + public_output_path=False + ) + self._insert_waterwolf_mock_data() + + with config_manager.context() as config: + tab = CronTabber(config) + tab.run_all() + + information = self._load_structure() + assert information['daily-url'] + assert not information['daily-url']['last_error'] + assert information['daily-url']['last_success'] + + # this should have created two .csv.gz files + now = datetime.datetime.utcnow() - datetime.timedelta(days=1) + + private = now.strftime('%Y%m%d-crashdata.csv.gz') + public = now.strftime('%Y%m%d-pub-crashdata.csv.gz') + ok_(private in os.listdir(self.tempdir)) + ok_(public not in os.listdir(self.tempdir)) + + private_path = os.path.join(self.tempdir, private) + f = gzip.open(private_path) + try: + eq_(f.read(), '') + finally: + f.close() diff --git a/socorro/unittest/cron/testNamedCursor.py b/socorro/unittest/cron/testNamedCursor.py new file mode 100644 index 0000000000..6a4ca97123 --- /dev/null +++ b/socorro/unittest/cron/testNamedCursor.py @@ -0,0 +1,159 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +""" +This is a named cursor proof of concept, flanged together quickly to use nose (see rename below) because all the +bits and pieces were lying around ready when I wrote it. 
I checked it in because someday, maybe, we'll want to do +another proof, if psycopg library gets smarter about fetchmany(). See one of the below (mirrors) for details: +http://www.velocityreviews.com/forums/t509431-psycopg2-amp-large-result-set.html +http://bytes.com/groups/python/652577-psycopg2-large-result-set +In essence: you can use fetchmany, but only with a named cursor, and only useful if you specify the size. +""" +import copy +import datetime as dt +import errno +import logging +import os +import psycopg2 +import time +from nose.tools import * + +import socorro.lib.ConfigurationManager as configurationManager +import socorro.database.database as sdatabase + +from socorro.lib.datetimeutil import UTC + +from socorro.unittest.testlib.testDB import TestDB +import socorro.unittest.testlib.dbtestutil as dbtestutil +from socorro.unittest.testbase import TestCase +import cronTestconfig as testConfig + +class Me: + pass +me = None + +def addReportData(cursor, dataToAdd): + # dataToAdd is [{},...] for dictionaries of values as shown in sql below + sql = """INSERT INTO reports + (uuid, client_crash_date, date_processed, product, version, build, url, install_age, last_crash, uptime, + email, os_name, os_version, + user_id, -- ignored (no longer collected) + user_comments, + app_notes, distributor, distributor_version) VALUES -- These are ignored for testing purposes + (%(uuid)s,%(client_crash_date)s,%(date_processed)s,%(product)s,%(version)s,%(build)s,%(url)s,%(install_age)s,%(last_crash)s,%(uptime)s, + %(email)s,%(os_name)s,%(os_version)s, + 0, + %(user_comments)s, + %(app_notes)s, %(distributor)s, %(distributor_version)s)""" + + cursor.executemany(sql,dataToAdd) + cursor.connection.commit() + +def createMe(): + global me + if not me: + me = Me() + me.config = configurationManager.newConfiguration(configurationModule = testConfig, applicationName = "Testing TopCrashers") + myDir = os.path.split(__file__)[0] + if not myDir: myDir = '.' 
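
The module docstring above makes the key psycopg2 point: fetchmany only avoids pulling the whole result set when it is used on a *named* (server-side) cursor and given an explicit batch size. A minimal sketch of that pattern, assuming a hypothetical DSN and the reports table these tests use::

    import psycopg2

    # hypothetical connection parameters -- adjust for your environment
    connection = psycopg2.connect(
        "host=localhost dbname=test user=tester password=secret"
    )
    try:
        # passing a name makes this a server-side (named) cursor
        cursor = connection.cursor('big_report_read')
        cursor.execute("SELECT id, uuid, date_processed FROM reports")
        while True:
            rows = cursor.fetchmany(512)  # one round trip per 512 rows
            if not rows:
                break
            for row in rows:
                pass  # process each row here
    finally:
        # a named cursor only lives inside its transaction
        connection.commit()
        connection.close()
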
+ replDict = {'testDir':'%s'%myDir} + for i in me.config: + try: + me.config[i] = me.config.get(i)%(replDict) + except: + pass + me.logFilePathname = me.config.logFilePathname + logfileDir = os.path.split(me.config.logFilePathname)[0] + try: + os.makedirs(logfileDir) + except OSError,x: + if errno.EEXIST != x.errno: raise + f = open(me.config.logFilePathname,'w') + f.close() + + fileLog = logging.FileHandler(me.logFilePathname, 'a') + fileLog.setLevel(logging.DEBUG) + fileLogFormatter = logging.Formatter(me.config.get('logFileLineFormatString','%(asctime)s %(levelname)s - %(message)s')) + fileLog.setFormatter(fileLogFormatter) + me.logger = logging.getLogger('cron_test') + me.logger.setLevel(logging.DEBUG) + me.logger.addHandler(fileLog) + me.database = sdatabase.Database(me.config) + #me.dsn = "host=%(databaseHost)s dbname=%(databaseName)s user=%(databaseUserName)s password=%(databasePassword)s" % (me.config) + +class TestNamedCursor(TestCase): + def setUp(self): + global me + if not me: + createMe() + self.testDB = TestDB() + self.testDB.removeDB(me.config, me.logger) + self.testDB.createDB(me.config, me.logger) + #self.connection = psycopg2.connect(me.dsn) + self.connection = me.database.connection() + + def tearDown(self): + global me + self.testDB.removeDB(me.config,me.logger) + self.connection.close() + + def reportDataGenerator(self,sizePerDay,numDays): + idGen = dbtestutil.moreUuid() + initialDate = dt.datetime(2008,1,1,1,1,1,1,tzinfo=UTC) + currentDate = dt.datetime(2008,1,1,1,1,1,1,tzinfo=UTC) + milli5 = dt.timedelta(milliseconds=5) + milli10 = dt.timedelta(milliseconds=10) + buildStrings = ['200712312355','200712302355','200712292355'] + buildDates = [dt.datetime(2007,12,31,23,55,tzinfo=UTC),dt.datetime(2007,12,30,23,55,tzinfo=UTC),dt.datetime(2007,12,29,23,55,tzinfo=UTC)] + osNameVersions = [('Windows NT','6.6.6'),('Windows NT','6.6.6'),('Windows','v.v.v'),('Windows','v.v.v'),('Windows','v.v.v'),('Windows','v.v.v'), + ('Mac OS X','10.5.5'),('Mac OS X','10.5.6'),('Mac OS X','10.5.6'), + ('Linux','10.10.10'),('Linux','10.10.11'), + ] + insData = [] + for dummyDays in range(numDays): + count = 0 + while count < sizePerDay: + os_name,os_version = osNameVersions[count % len(osNameVersions)] + data = { + 'uuid':idGen.next(), + 'client_crash_date':currentDate, + 'date_processed': currentDate+milli5, + 'product': 'foxy', + 'version': '3.6.9b2', + 'build': buildStrings[count%len(buildStrings)], + 'url':'http://www.woo.wow/weee', + 'install_age':89000, + 'last_crash':0, + 'uptime':88000, + 'email':None, + 'os_name': os_name, + 'os_version': os_version, + #'build_date': buildDates[count%len(buildDates)], + 'user_comments': 'oh help', + 'app_notes':"", + 'distributor':"", + 'distributor_version':"", + } + insData.append(data) + if not count%(3): + currentDate += milli10 + count += 1 + currentDate = initialDate+dt.timedelta(days=1) + cursor = self.connection.cursor() + addReportData(cursor,insData) + + def build1000(self): #testBuild1000(self): + self.reportDataGenerator(1000,10) + ncursor = self.connection.cursor('myCursor') + ncursor.execute('SELECT id,uuid,client_crash_date,date_processed from reports') + try: + while True: + data = ncursor.fetchmany(512) + if data and len(data): + print data[0][0],len(data) + else: + print "Broke: %s"%(data) + break + finally: + self.connection.commit() diff --git a/socorro/unittest/database/createSchema.py b/socorro/unittest/database/createSchema.py new file mode 100755 index 0000000000..2a55da0f53 --- /dev/null +++ 
b/socorro/unittest/database/createSchema.py @@ -0,0 +1,63 @@ +#! /usr/bin/env python +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +# XXX Set to be deprecated in favor of socorro/external/postgresql/models.py + +""" +Just set up the database and exit. Assume we can get config details from the test config file, but allow sys.argv to override +""" +import logging +import sys +import socorro.lib.ConfigurationManager as configurationManager +from socorro.unittest.testlib.testDB import TestDB +import dbTestconfig as testConfig + +def help(): + print """Usage: (python) createSchema.py [config-options] [--help] +First removes all the known socorro tables, then creates an instance of +the current socorro schema in an existing database. Does NOT drop tables +other than the ones known to this schema. +Default: use current unittest config for host, database, user and password. + --help: print this message and exit + config-options: You may pass any of the following: + [--]host=someHostName + [--]dbname=someDatabaseName + [--]user=someUserName + [--]password=somePassword + """ + +def main(): + logger = logging.getLogger("topcrashes_summary") + logger.setLevel(logging.WARNING) + + stderrLog = logging.StreamHandler() + stderrLog.setLevel(logging.WARNING) + stderrLogFormatter = logging.Formatter('%(asctime)s %(levelname)s - %(message)s') + stderrLog.setFormatter(stderrLogFormatter) + logger.addHandler(stderrLog) + + kwargs = {} + for i in sys.argv[1:]: + if i.startswith('-h') or i.startswith('--he'): + help() + sys.exit(0) + j = i + if i.startswith('-'): + j = i.lstrip('-') + if '=' in j: + name,value = (s.strip() for s in j.split('=')) + kwargs[name] = value + else: + print >> sys.stderr,"Ignoring unkown argument '%s'"%(i) + sys.argv = sys.argv[:1] + config = configurationManager.newConfiguration(configurationModule = testConfig, applicationName='Create Database') + config.update(kwargs) + testDB = TestDB() + testDB.removeDB(config,logger) + testDB.createDB(config,logger) + +if __name__ == '__main__': + main() + diff --git a/socorro/unittest/database/dbTestconfig.py b/socorro/unittest/database/dbTestconfig.py new file mode 100644 index 0000000000..a53fcae8d1 --- /dev/null +++ b/socorro/unittest/database/dbTestconfig.py @@ -0,0 +1,23 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +# XXX Set to be deprecated in favor of socorro/external/postgresql/models.py + +## Test config file for database utilities +import socorro.lib.ConfigurationManager as cm +import datetime + +from socorro.unittest.config.commonconfig \ + import databaseHost as database_hostname +from socorro.unittest.config.commonconfig \ + import oldDatabaseName as database_name +from socorro.unittest.config.commonconfig \ + import databaseUserName as database_username +from socorro.unittest.config.commonconfig \ + import databasePassword as database_password + +logFilePathname = cm.Option() +logFilePathname.doc = 'full pathname for the log file' +logFilePathname.default = '%(testDir)s/logs/db_test.log' + diff --git a/socorro/unittest/database/testCachedIdAccess.py b/socorro/unittest/database/testCachedIdAccess.py new file mode 100644 index 0000000000..4fa775942d --- /dev/null +++ b/socorro/unittest/database/testCachedIdAccess.py @@ -0,0 +1,360 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +# XXX Set to be deprecated in favor of socorro/external/postgresql/models.py + +import socorro.database.cachedIdAccess as cia + +import errno +import logging +import os +import time + +from nose.tools import * +from nose.plugins.skip import SkipTest + +import psycopg2 + +import dbTestconfig as testConfig +from socorro.unittest.testlib.testDB import TestDB +import socorro.lib.ConfigurationManager as configurationManager + +class Me: # Class 'Me' is just a way to say 'global' once per method + pass +me = None +# Note: without setup_module and teardown_module we get weird double, treble and more multiple (logging of) calls +# to testDB.removeDB() and testDB.createDB using nosetests as the driver. I have no idea, but this is a pragmatic +# work around: do setup and teardown at the (spelled) module level. + +def setup_module(): + global me + if me: + return + me = Me() + me.testDB = TestDB() + me.config = configurationManager.newConfiguration(configurationModule = testConfig, applicationName='TestingCachedIdAccess') + myDir = os.path.split(__file__)[0] + if not myDir: myDir = '.' + replDict = {'testDir':'%s'%myDir} + for i in me.config: + try: + me.config[i] = me.config.get(i)%(replDict) + except: + pass + cia.logger.setLevel(logging.DEBUG) + me.logFilePathname = me.config.logFilePathname + logfileDir = os.path.split(me.config.logFilePathname)[0] + try: + os.makedirs(logfileDir) + except OSError,x: + if errno.EEXIST != x.errno: raise + f = open(me.config.logFilePathname,'w') + f.close() + + fileLog = logging.FileHandler(me.logFilePathname, 'a') + fileLog.setLevel(logging.DEBUG) + fileLogFormatter = logging.Formatter('%(asctime)s %(levelname)s - %(message)s') + fileLog.setFormatter(fileLogFormatter) + cia.logger.addHandler(fileLog) + me.logger = cia.logger + me.dsn = "host=%s dbname=%s user=%s password=%s" % (me.config.database_hostname,me.config.database_name, + me.config.database_username,me.config.database_password) + +def teardown_module(): + me.testDB = None + +class TestCachedIdAccess: + def setUp(self): + global me + assert me, 'DAMN! what happened?!' 
+ # Remove/Create is being tested elsewhere via models.py & setupdb_app.py now + me.testDB.removeDB(me.config,me.logger) + me.testDB.createDB(me.config,me.logger) + self.connection = psycopg2.connect(me.dsn) + + def tearDown(self): + global me + sql = 'DELETE from %s' + cursor = self.connection.cursor() + me.testDB.removeDB(me.config,me.logger) + + def testCreateProductRelease(self): + """ + TestCachedIdAccess:testCreateProductRelease(self): + for a bunch of possible version strings, check that the calculated release name is appropriate + """ + data = [ + # test for major and fails major + ('a.b.c',None), + ('presumptive.behavior',None), + ('0.0','major'), + ('3.4','major'), + ('1.2.3','major'), + ('0.123456','major'), + ('123456.0','major'), + ('1.2.3.4.5.6.7.8.9.10','major'), + ('a.2.3.4.5.6.7.8.9.10',None), + ('1.2.3.4.5.b.7.8.9.10',None), + ('1.2.3.4.5.6.7.8.9.a',None), + ('1.2.3.4.5.6.7.8.9.b',None), + ('1.',None), + ('1.2.',None), + ('1.2.3.',None), + ('1,2',None), + ('1..2.3',None), + ('0',None), + ('5',None), + ('12345',None), + ('.1',None), + ('.1.2',None), + ('.1.2.3',None), + ('.1.2.3.',None), + + # development and fails release + ('1.2a','development'), + ('1.2a1','development'), + ('1.2.3.4.5.6.7.8.9a','development'), + ('1.2.3.4.5.6.7.8.9a9','development'), + ('1.1234567a','development'), + ('1.1234567a999','development'), + ('1.2b','development'), + ('1.2b2','development'), + ('1.1234567b','development'), + ('1.1234567b2233','development'), + ('1.2.3.4.5.6.7.8.9b','development'), + ('1.2.3.4.5.6.7.8.9b9','development'), + ('1.a',None), + ('1.a3',None), + ('1.b',None), + ('1.b4',None), + ('1.2.3c',None), + ('1.2.3c5',None), + ('a1.2.3',None), + ('b1.2.3',None), + #('1.2.3.4.5.6.7.8.9.a',None), # done above + + # milestone and fails milestone + ('3.1pre','milestone'), + ('3.1apre','milestone'), + ('3.1bpre','milestone'), + ('3.1a1pre','milestone'), + ('3.1b99pre','milestone'), + ('pre3.1',None), + ('pre3.1a',None), + ('3.1prea',None), + ('3.1preb',None), + ('3.1apre1',None), + ('3.1prea1',None), + ('3.1.pre',None), + ] + for trial,expect in data: + got = cia.createProductRelease(trial) + assert expect == got, "For '%s': Expected '%s', got '%s'"%(trial,expect,got) + + #def testConstructor(self): # Fully tested in testClearAndInitializeCache() + + def testShrinkIdCache(self): + idCache = dict((x,x) for x in range(7)) + idCount = dict((x,10-x) for x in range(7)) + # expect the first half of the map + expectCache = dict((x,x) for x in range(4)) + expectCount = dict((x,1) for x in range(4)) + gotCache, gotCount = cia.shrinkIdCache(idCache,idCount) + assert expectCache == gotCache, 'Expect %s, got %s'%(expectCache,gotCache) + assert expectCount == gotCount, 'Expect %s, got %s'%(expectCount,gotCount) + # expect the same if the key to save is already in the saved part + gotCache, gotCount = cia.shrinkIdCache(idCache,idCount,oneKeyToSave=2) + assert expectCache == gotCache, 'Expect %s, got %s'%(expectCache,gotCache) + assert expectCount == gotCount, 'Expect %s, got %s'%(expectCount,gotCount) + # expect the key and its count if key to save would have been discarded + gotCache, gotCount = cia.shrinkIdCache(idCache,idCount,oneKeyToSave=5) + expectCache[5] = 5 + expectCount[5] = 1 + assert expectCache == gotCache, 'Expect %s, got %s'%(expectCache,gotCache) + assert expectCount == gotCount, 'Expect %s, got %s'%(expectCount,gotCount) + # expect reverse, just to be sure + idCount = dict((x,10+x) for x in range(7)) + expectCache = dict((x,x) for x in range(3,7)) + expectCount = dict((x,1) for 
x in range(3,7)) + gotCache, gotCount = cia.shrinkIdCache(idCache,idCount) + assert expectCache == gotCache, 'Expect %s, got %s'%(expectCache,gotCache) + assert expectCount == gotCount, 'Expect %s, got %s'%(expectCount,gotCount) + assert_raises(KeyError,cia.shrinkIdCache,idCache,idCount,99) + + def testAssureAndGetId(self): + createSql = """CREATE TABLE moolah ( + id serial not null primary key, + n text NOT NULL, + o text NOT NULL, + p text + ); + CREATE INDEX moolah_no ON moolah (n,o); + """ + dropSql = "DROP TABLE IF EXISTS moolah CASCADE" + delSql = "DELETE FROM moolah" + getSqlk = "SELECT id from moolah WHERE n=%s and o=%s" + putSqlk = "INSERT INTO moolah (n,o) VALUES(%s,%s)" + getSqld = "SELECT id from moolah WHERE n=%(n)s and o=%(o)s" + putSqld = "INSERT INTO moolah (n,o,p) VALUES(%(n)s,%(o)s,%(p)s)" + checkSql= "SELECT id,n,o,p from moolah" + countSql= "SELECT count(id) from moolah" + cursor = self.connection.cursor() + try: + # setup + cursor.execute(createSql) + self.connection.commit() + #end of setup + idc = cia.IdCache(cursor) + ktests = [ + (('n','o',),1), + (('nn','oo'),2), + (('nn','o'),3), + (('nn','oo'),2), + (('nn','o'),3), + ] + + dtests = [ + (('n','o',),{'n':'n','o':'o','p':'p'},1), + (('nn','oo'),{'n':'nn','o':'oo','p':'pp'},2), + (('nn','o'),{'n':'nn','o':'o','p':'p'},3), + (('nn','oo'),{'n':'nn','o':'oo','p':'pp'},2), + (('nn','o'),{'n':'nn','o':'o','p':'pp'},3), + ] + + # test with key and no cache + # - the database gets each (and only) new key + # - the ids are as expected + idCache = None + idCount = None + idSet = set() + rowCount = 0 + for v in ktests: + id = idc.assureAndGetId(v[0],'moolah',getSqlk,putSqlk,idCache,idCount) + if not id in idSet: + rowCount += 1 + idSet.add(id) + assert v[1] == id, 'Expected %s, got %s'%(v[1],id) + cursor.execute(checkSql) + data = cursor.fetchall() + self.connection.commit() + assert (v[1],v[0][0],v[0][1], None) in data + assert len(data) == rowCount + assert idCache == idCount + assert idCache == None + + cursor.execute(delSql) + self.connection.commit() + + # test with key and full cache: + # - know the id from the cache + # - the database isn't updated + idCache = {('n','o'):23, ('nn','oo'):24, ('nn','o'):25} + idCount = {('n','o'):5, ('nn','oo'):10, ('nn','o'):10} + testIdCount = {('n','o'):5, ('nn','oo'):10, ('nn','o'):10} + for v in ktests: + id = idc.assureAndGetId(v[0],'moolah',getSqlk,putSqlk,idCache,idCount) + assert idCache.get(v[0]) == id + testIdCount[v[0]] += 1 + cursor.execute(countSql) + self.connection.commit() + count = cursor.fetchone() + assert 0 == count[0] + assert testIdCount == idCount + + cursor.execute(dropSql) + cursor.execute(createSql) + self.connection.commit() + # test with key and initially empty cache: + idSet = set() + rowCount = 0 + idCache = {} + idCount = {('n','o'):5, ('nn','oo'):10, ('nn','o'):10} + testIdCount = {('n','o'):0, ('nn','oo'):0, ('nn','o'):0} + for v in ktests: + id = idc.assureAndGetId(v[0],'moolah',getSqlk,putSqlk,idCache,idCount) + if not id in idSet: + rowCount += 1 + idSet.add(id) + assert idCache.get(v[0]) == id + assert v[1] == id + testIdCount[v[0]] += 1 + cursor.execute(countSql) + self.connection.commit() + count = cursor.fetchone() + assert rowCount == count[0] + assert testIdCount == idCount + + cursor.execute(dropSql) + cursor.execute(createSql) + self.connection.commit() + # test with dictKey and full cache: + # - know the id from the cache + # - the database isn't updated + idCache = {('n','o'):23, ('nn','oo'):24, ('nn','o'):25} + idCount = {('n','o'):5, 
('nn','oo'):10, ('nn','o'):10} + testIdCount = {('n','o'):5, ('nn','oo'):10, ('nn','o'):10} + for v in dtests: + id = idc.assureAndGetId(v[0],'moolah',getSqld,putSqld,idCache,idCount,dkey=v[1]) + assert idCache.get(v[0]) == id + testIdCount[v[0]] += 1 + cursor.execute(countSql) + self.connection.commit() + count = cursor.fetchone() + assert 0 == count[0] + assert testIdCount == idCount + + cursor.execute(dropSql) + cursor.execute(createSql) + self.connection.commit() + # test with dictKey and initially empty cache: + idSet = set() + rowCount = 0 + idCache = {} + idCount = {('n','o'):5, ('nn','oo'):10, ('nn','o'):10} + testIdCount = {('n','o'):0, ('nn','oo'):0, ('nn','o'):0} + for v in dtests: + id = idc.assureAndGetId(v[0],'moolah',getSqld,putSqld,idCache,idCount,dkey=v[1]) + if not id in idSet: + rowCount += 1 + idSet.add(id) + assert idCache.get(v[0]) == id + assert v[2] == id + testIdCount[v[0]] += 1 + cursor.execute(countSql) + self.connection.commit() + count = cursor.fetchone() + assert rowCount == count[0] + assert testIdCount == idCount + + finally: + # teardown + cursor.execute(dropSql) + self.connection.commit() + + def testGetAppropriateOsVersion(self): + cursor = self.connection.cursor() + idc = cia.IdCache(cursor) + testList = [ + (('','5.1.2600 SP2'),'5.1.2600 SP2'), + (('Windows NT',''),''), + (('Windows NT','5.1.2600 SP2'),'5.1.2600 SP2'), + (('Windows NT','5.1.2600 SP3'),'5.1.2600 SP3'), + (('Windows','5.1.2600 SP2'),'5.1.2600 SP2'), + (('Windows NT','5.1.2600 SP3'),'5.1.2600 SP3'), + (('Linux', '0.0.0 Linux 1.2.3 i586 Linux'),'1.2.3 i586'), + (('Linux', '0.0.0 Linux 2.4.6_flitteration x86_64 Linux'),'2.4.6 x86_64'), + (('Linux', '0.0.0 Linux 2.4.6.flapitation x86_64 Linux'),'2.4.6 x86_64'), + (('Linux', '0.0.0 Linux 2.4.6.flapitation-very-long'),'2.4.6 ?arch?'), + (('Linux', '0.0.0 Linux 2.4.6.flapitation-very-long x86_6'),'2.4.6 ?arch?'), + (('Linux', '0.0.0 Linux 2.4.6.flapitation-very-very-very-long-really'),'2.4.6 ?arch?'), + (('Linux', '0.0.0 Linux 1.2.3 i586 Linux'),'1.2.3 i586'), + (('Linux', '1.2.3 i686'),''), + (('Namby', 'wiggle room'),'wiggle room'), + (('Linux', 'Linux 1.2.3 i586 Linux'),''), + (('Linux', '0.0.0 Linux non-numeric-version-string i586 Linux'),''), + ] + for testCase in testList: + got = idc.getAppropriateOsVersion(*testCase[0]) + assert testCase[1] == got,'From "%s": Expected "%s", got "%s"'%(testCase[0][1],testCase[1],got) + diff --git a/socorro/unittest/database/testDatabase.py b/socorro/unittest/database/testDatabase.py new file mode 100644 index 0000000000..8df7d890a9 --- /dev/null +++ b/socorro/unittest/database/testDatabase.py @@ -0,0 +1,295 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
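
The assureAndGetId tests above (testCachedIdAccess.py) exercise a look-up-or-insert pattern: consult an in-memory id cache first, fall back to a SELECT, and INSERT only when the key is genuinely new. The production logic lives in socorro.database.cachedIdAccess and also maintains the usage-count map and the optional dictionary key (dkey) seen in the tests; the following is only a rough sketch of the core pattern the tests assume, using the same kind of parameterized SQL::

    def lookup_or_insert_id(cursor, key, get_sql, put_sql, id_cache=None):
        # cheap path: the id is already cached for this key
        if id_cache is not None and key in id_cache:
            return id_cache[key]
        # otherwise ask the database
        cursor.execute(get_sql, key)
        row = cursor.fetchone()
        if row is None:
            # the key is new: insert it, then re-read the generated id
            cursor.execute(put_sql, key)
            cursor.connection.commit()
            cursor.execute(get_sql, key)
            row = cursor.fetchone()
        found_id = row[0]
        if id_cache is not None:
            id_cache[key] = found_id
        return found_id
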
+ +# XXX Set to be deprecated in favor of socorro/external/postgresql/models.py + +import socorro.database.database as db +import psycopg2 +import psycopg2.extensions +import logging +import threading + +from socorro.unittest.testlib.loggerForTest import TestingLogger +from createDBforTest import * +import socorro.lib.util as util +from socorro.unittest.testbase import TestCase +import socorro.lib.ConfigurationManager as cm +import dbTestconfig as testConfig +config = cm.newConfiguration(configurationModule = testConfig, applicationName='Testing Psycopghelper') + +""" +Assume that psycopg2 works, then all we need to do is assure ourselves +that our simplistic wrap around a returned array is correct +""" + +class TestMultiCursor(psycopg2.extensions.cursor): + def __init__(self,numCols = 4, numRows=2, **kwargs): + self.result = [] + for i in range(numRows): + aRow = [] + for j in range(numCols): + aRow.append('Row %d, Column %d' %(i,j)) + self.result.append(aRow) + self.next = self.__next() + def execute(self,sql, args=None): + pass + def fetchall(self): + return self.result + def __next(self): + index = 0 + while True: + try: + yield self.result[index] + index += 1 + except: + yield None + def fetchone(self): + try: + return self.next.next() + except: + return None + +class TestEmptyCursor(psycopg2.extensions.cursor): + def __init__(self): + self.result = [] + def execute(self,sql, args=None): + pass + def fetchall(self): + return self.result + +class TestSingleCursor(psycopg2.extensions.cursor): + def __init__(self): + self.result = [['Row 0, Column 0']] + def execute(self,sql, args=None): + pass + def fetchall(self): + return self.result + + +class TestDatabase(TestCase): + def setUp(self): + self.logger = TestingLogger() + self.connectionData0 = (config.database_hostname,config.database_name,config.database_username,config.database_password) + self.connectionDataL = (config.database_hostname,config.database_name,config.database_username,config.database_password,self.logger) + self.dsn = "host=%s dbname=%s user=%s password=%s" % self.connectionData0 + self.connection = psycopg2.connect(self.dsn) + createDB(self.connection) + + def tearDown(self): + dropDB(self.connection) + self.connection.close() + + def testExecute(self): + aCursor = TestMultiCursor(numCols=1,numRows=3) + f = db.execute(aCursor,"") + vals = [x for x in f] + assert 3 == len(vals) + assert 'Row 0, Column 0' == vals[0][0] + assert 'Row 2, Column 0' == vals[-1][0] + aCursor = TestMultiCursor(numCols=1,numRows=1) + + def testSingleValueEmpty(self): + try: + cur = TestEmptyCursor() + db.singleValueSql(cur,"") + assert False, "must raise SQLDidNotReturnSingleValue" + except db.SQLDidNotReturnSingleValue,e: + pass + + def testSingleValueSingle(self): + try: + cur = TestSingleCursor() + assert "Row 0, Column 0" == db.singleValueSql(cur,"") + except Exception, e: + assert False, "must not raise an exception for this %s" %e + + def testSingleValueMulti(self): + try: + cur = TestMultiCursor(numRows=5) + assert "Row 0, Column 0" == db.singleValueSql(cur,"") + except Exception, e: + assert False, "must not raise an exception for this "+e + + def testSingleRowEmpty(self): + try: + cur = TestEmptyCursor() + db.singleRowSql(cur,"") + assert False, "must raise SQLDidNotReturnSingleRow" + except db.SQLDidNotReturnSingleRow,e: + pass + + def testSingleRowSingle(self): + try: + cur = TestSingleCursor() + assert ["Row 0, Column 0"] == db.singleRowSql(cur,"") + except Exception, e: + assert False, "must not raise this exception" + + def 
testSingleRowMulti(self): + try: + cur = TestMultiCursor(numRows=5, numCols=1) + assert ["Row 0, Column 0"] == db.singleRowSql(cur,"") + except Exception, e: + assert False, "must not raise this exception" + + def testDatabaseInstantiation(self): + sample1 = {'database_hostname': 'A','database_port': 'B','database_name': 'C','database_username': 'D','database_password': 'E',} + d = db.Database(sample1) + assert d.dsn == 'host=A port=B dbname=C user=D password=E', 'dsn not created correctly' + assert type(d.logger) == type(util.FakeLogger()), 'should have a %s but got %s instead' % (type(util.FakeLogger()), type(d.logger)) + d = db.Database(sample1, 1) + assert d.logger == 1, 'logger pass as a parameter was not saved, got %s instead' % d.logger + sample1 = {'database_hostname': 'A','database_port': 'B','database_name': 'C','database_username': 'D','database_password': 'E', 'logger':2} + d = db.Database(sample1) + assert d.dsn == 'host=A port=B dbname=C user=D password=E', 'dsn not created correctly' + assert d.logger == 2, 'logger passed with dictionary was not saved, got %s instead' % d.logger + d = db.Database(sample1, 1) + assert d.dsn == 'host=A port=B dbname=C user=D password=E', 'dsn not created correctly' + assert d.logger == 1, 'logger passed with dictionary was not overridden by logger passed as a parameter, got %s instead' % d.logger + + + def testConnectionPoolConstructor(self): + # just test some variations on constructor calls + logger = self.logger + logger.clear() + try: + cp = db.DatabaseConnectionPool() + assert False, 'expected a raised TypeError, not to get here' + except TypeError,x: + pass + except Exception,x: + assert False, 'expected a TypeError, not %s: %s'%(type(x),x) + try: + cp = db.DatabaseConnectionPool(config) + except Exception,x: + assert False, 'expected the non-logger constructor to succeed, got %s: %s'%(type(x),x) + try: + cp = db.DatabaseConnectionPool(config, self.logger) + except Exception,x: + assert False, 'expected the with-logger constructor to succeed, got %s: %s'%(type(x),x) + + def testConnectionPoolConnectToDatabase(self): + logger = self.logger + logger.clear() + cp = db.DatabaseConnectionPool(config, logger) + logger.clear() + try: + connection,cursor = cp.connectionCursorPair() + assert connection + assert cursor + except Exception,x: + assert False, 'expected no exceptions, got %s: %s'%(type(x),x) + + def testConnectionPoolConnectionCursorPair(self): + logger = self.logger + logger.clear() + cp = db.DatabaseConnectionPool(config, logger) + connection0 = cursor0 = None + try: + connection0, cursor0 = cp.connectionCursorPair() + assert connection0 + assert cursor0 + except Exception,x: + assert False, 'expected nothing, got %s: %s'%(type(x),x) + connection1, cursor1 = cp.connectionCursorPair() + assert connection0 == connection1 + assert cursor0 != cursor1 + + logger.clear() + cp = db.DatabaseConnectionPool(config, logger) + connection0 = cursor0 = None + try: + connection0,cursor0 = cp.connectionCursorPair() + except Exception,x: + assert False, 'Expected OperationalError above, got %s: %s' %(type(x),x) + + def testConnectionPoolCleanup(self): + class FakeLogger(object): + def __init__(self): + self.logs = [] + def debug(self, *args): + self.logs.append(args) + logger = FakeLogger() + cp = db.DatabaseConnectionPool(config, logger) + conn = cp.connection() + cp.cleanup() + conn = cp.connection() + conn = cp.connection('fred') + cp.cleanup() + expected = [('%s - killing database connections', 'MainThread'), + ('%s - connection %s closed', 
'MainThread', 'MainThread'), + ('%s - killing database connections', 'MainThread'), + ('%s - connection %s closed', 'MainThread', 'MainThread'), + ('%s - connection %s closed', 'MainThread', 'fred')] + assert len(expected) == len(logger.logs) + for e, a in zip(expected, logger.logs): + assert e == a + + def test_connection_attempt_count(self): + logger = self.logger + logger.clear() + class ConnectionCountingFakeDatabase(object): + def __init__(self, config, logger=None): + self.connect_counter = 0 + def connection(self, database_module=None): + self.connect_counter += 1 + return 17 + logger = self.logger + temp_Database = db.Database + db.Database = ConnectionCountingFakeDatabase + try: + db_pool = db.DatabaseConnectionPool(config, logger) + c1 = db_pool.connection() + assert db_pool.database.connect_counter == 1 + c1 = db_pool.connection() + assert db_pool.database.connect_counter == 1 + c1 = db_pool.connection('fred') + assert db_pool.database.connect_counter == 2 + c1 = db_pool.connection() + assert db_pool.database.connect_counter == 2 + c1 = db_pool.connection('fred') + assert db_pool.database.connect_counter == 2 + finally: + db.Database = temp_Database + + + def testLoggingCursorExecute(self): + logCursor = self.connection.cursor(cursor_factory=db.LoggingCursor) + logCursor.setLogger(self.logger) + self.logger.clear() + logCursor.execute('select 4;') + assert logging.INFO == self.logger.levels[0], "Expect level %s, go %s"%(logging.INFO,self.logger.levels[0]) + assert self.logger.buffer[0] == 'select 4;','... but got %s'%(self.logger.buffer[0]) + params = {'id':3} + logCursor.execute("select id from gringo where id=%(id)s;",params) + assert logging.INFO == self.logger.levels[1] + expected = "select id from gringo where id=%(id)s;"%(params) + got = self.logger.buffer[1] + assert expected == got, "Expected [%s] but got [%s]"%(expected,got) + params = [3] + logCursor.execute("select id from gringo where id=%s;",params) + expected = "select id from gringo where id=%s;"%(params[0]) + got = self.logger.buffer[2] + assert expected == got, "Expected [%s] but got [%s]"%(expected,got) + + def testLoggingCursorExecutemany(self): + logCursor = self.connection.cursor(cursor_factory=db.LoggingCursor) + logCursor.setLogger(self.logger) + self.logger.clear() + def chargen(): + for i in 'abcdef': + yield (i,) + + logCursor.executemany("insert into chartable values (%s)",chargen()) + assert self.logger.buffer[0] == 'insert into chartable values (%s) ...' + assert self.logger.levels[0] == logging.INFO + data = ('g','h') + logCursor.executemany("insert into chartable values (%s)",data) + assert self.logger.buffer[1].startswith("insert into chartable values") + assert "'g'" in self.logger.buffer[1] + assert "..." in self.logger.buffer[1] + assert self.logger.levels[1] == logging.INFO + +if __name__ == "__main__": + unittest.main() diff --git a/socorro/unittest/database/testPostgresql.py b/socorro/unittest/database/testPostgresql.py new file mode 100644 index 0000000000..69f73dc5c8 --- /dev/null +++ b/socorro/unittest/database/testPostgresql.py @@ -0,0 +1,436 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
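
testLoggingCursorExecute and testLoggingCursorExecutemany above rely on psycopg2's cursor_factory hook: passing a cursor subclass to connection.cursor() lets execute() be intercepted so the bound SQL can be logged before it runs. The real class is socorro.database.database.LoggingCursor; the sketch below only illustrates the general psycopg2 pattern, with the setLogger method assumed from the tests and executemany truncation omitted::

    import logging
    import psycopg2
    import psycopg2.extensions

    class SketchLoggingCursor(psycopg2.extensions.cursor):
        # not the Socorro class -- just the cursor_factory pattern
        def setLogger(self, logger):
            self.logger = logger

        def execute(self, sql, args=None):
            # mogrify() returns the query with arguments bound, which is
            # what the tests expect to see logged at INFO level
            self.logger.info(self.mogrify(sql, args))
            super(SketchLoggingCursor, self).execute(sql, args)

    # usage, assuming an existing dsn string:
    # connection = psycopg2.connect(dsn)
    # cursor = connection.cursor(cursor_factory=SketchLoggingCursor)
    # cursor.setLogger(logging.getLogger('sql'))
    # cursor.execute('select 4;')
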
+ +# XXX Set to be deprecated in favor of socorro/external/postgresql/models.py + +import errno +import logging +import os + +import psycopg2 + +import socorro.lib.ConfigurationManager as configurationManager +import socorro.database.postgresql as postg +import socorro.database.schema as schema + +from socorro.unittest.testlib.loggerForTest import TestingLogger +from socorro.unittest.testlib.testDB import TestDB + +from nose.tools import * + +import dbTestconfig as testConfig + +testTableNames = [ + "foo", + "foo_1", + "foo_2", + "a_foo", + "boot", + "rip", + ] +testTablePatterns = { + 'foo%':['foo','foo_1','foo_2',], + 'foo_%':['foo_1','foo_2',], + '%foo':['foo','a_foo',], + '%oo%':['foo','foo_1','foo_2','a_foo','boot'], + 'rip':['rip'], + 'rap':[], + } +class Me: # not quite "self" + """ + I need stuff to be initialized once per module. Rather than having a bazillion globals, lets just have 'me' + """ + pass +me = None + +def setup_module(): + global me + if me: + return + me = Me() + # config gets messed up by some tests. Use this one during module setup and teardown + me.config = configurationManager.newConfiguration(configurationModule = testConfig, applicationName='Testing Postgresql Utils') + myDir = os.path.split(__file__)[0] + if not myDir: myDir = '.' + replDict = {'testDir':'%s'%myDir} + for i in me.config: + try: + me.config[i] = me.config.get(i)%(replDict) + except: + pass + me.logFilePathname = me.config.logFilePathname + if not me.logFilePathname: + me.logFilePathname = 'logs/db_test.log' + logFileDir = os.path.split(me.logFilePathname)[0] + try: + os.makedirs(logFileDir) + except OSError,x: + if errno.EEXIST == x.errno: pass + else: raise + f = open(me.logFilePathname,'w') + f.close() + fileLog = logging.FileHandler(me.logFilePathname, 'a') + fileLog.setLevel(logging.DEBUG) + fileLogFormatter = logging.Formatter('%(asctime)s %(levelname)s - %(message)s') + fileLog.setFormatter(fileLogFormatter) + me.logger = logging.getLogger("testPostresql") + me.logger.addHandler(fileLog) + me.dsn = "host=%s dbname=%s user=%s password=%s" % (me.config.database_hostname,me.config.database_name, + me.config.database_username,me.config.database_password) + +def teardown_module(): + try: + os.unlink(me.logFilePathname) + except: + pass + +class TestPostgresql: + def setUp(self): + global me + # config gets messed up by some tests. Use this one by preference + self.config = configurationManager.newConfiguration(configurationModule = testConfig, applicationName='Testing Postgresql Utils') + for i in self.config: + try: + self.config[i] = self.config.get(i)%(replDict) + except: + pass + self.connection = psycopg2.connect(me.dsn) + self.testDB = TestDB() + + def tearDown(self): + cursor = self.connection.cursor() + dropSql = "drop table if exists %s" + for tn in testTableNames: + cursor.execute(dropSql%tn) + self.connection.commit() + self.connection.close() + + def testTablesMatchingPattern(self): + cursor = self.connection.cursor() + createSql = "CREATE TABLE %s (id integer)" # postgresql allows empty tables, but it makes me itch... 
+ for tn in testTableNames: + cursor.execute(createSql%tn) + self.connection.commit() + for pat in testTablePatterns: + result = postg.tablesMatchingPattern(pat,cursor) + expected = testTablePatterns[pat] + assert set(expected)==set(result), "for %s: expected:%s, result:%s"%(pat,expected,result) + self.connection.commit() + + #our database is not currently using any triggers + #def testTriggersForTable(self): + #""" + #TestPostgresql:testTriggersForTable(self) + #- If you have trouble with (only) this test, be sure the db owner has executed CREATE LANGUAGE PLPGSQL; for the test db + #""" + #global me + #cursor = self.connection.cursor() + #setupSql = """ + #DROP TABLE IF EXISTS ttrigs; + #CREATE TABLE ttrigs (id serial); + #""" + #makeTriggerSql = """ + #CREATE OR REPLACE FUNCTION check_trigger() returns trigger AS ' + #BEGIN + #RETURN new; + #END + #' LANGUAGE plpgsql; + #CREATE TRIGGER check_trigger_t AFTER INSERT ON ttrigs FOR EACH ROW EXECUTE PROCEDURE check_trigger(); + #""" + #try: + #cursor.execute(setupSql) + #self.connection.commit() + #theList = postg.triggersForTable('ttrigs',cursor) + #assert [] == theList + #cursor.execute(makeTriggerSql) + #self.connection.commit() + #theList = postg.triggersForTable('ttrigs',cursor) + #assert ['check_trigger_t'] == theList + #cursor.execute("CREATE TRIGGER check_trigger_2 AFTER UPDATE ON ttrigs FOR EACH ROW EXECUTE PROCEDURE check_trigger();") + #self.connection.commit() + #theList = postg.triggersForTable('ttrigs',cursor) + #assert set(['check_trigger_t', 'check_trigger_2']) == set(theList),'but got %s'%(set(theList),) + #finally: + #cursor.execute("DROP TABLE IF EXISTS ttrigs") + #self.connection.commit() + + def testIndexesForTable(self): + global me + cursor = self.connection.cursor() + setupSql = """ + DROP TABLE IF EXISTS tindex; + CREATE TABLE tindex (id serial,i integer, f float); + """ + try: + cursor.execute(setupSql) + self.connection.commit() + indices = postg.indexesForTable('tindex',cursor) + assert [] == indices + cursor.execute("CREATE INDEX ti_id ON tindex (id);") + self.connection.commit() + indices = postg.indexesForTable('tindex',cursor) + assert ['ti_id'] == indices + cursor.execute("CREATE INDEX ti_i ON tindex (i);") + self.connection.commit() + indices = postg.indexesForTable('tindex',cursor) + assert set(['ti_id','ti_i']) == set(indices), "but got %s"%(set(indices)) + cursor.execute("CREATE INDEX ti_i_f ON tindex (i,f);") + self.connection.commit() + indices = postg.indexesForTable('tindex',cursor) + assert set(['ti_id','ti_i','ti_i_f']) == set(indices), 'but %s'%(indices) + finally: + cursor.execute("DROP TABLE IF EXISTS tindex;") + self.connection.commit() + + def testRulesForTable(self): + global me + cursor = self.connection.cursor() + setupSql = """ + DROP TABLE IF EXISTS trules; + CREATE TABLE trules (id serial,i integer); + """ + try: + cursor.execute(setupSql) + self.connection.commit() + rules = postg.rulesForTable('trules',cursor) + assert [] == rules + cursor.execute("CREATE RULE notify_me AS ON UPDATE TO trules DO NOTIFY trules;") + self.connection.commit() + assert ['notify_me'] == postg.rulesForTable('trules',cursor) + finally: + cursor.execute("DROP TABLE IF EXISTS trules;") + self.connection.commit() + + def testContraintsAndTypeForTable(self): + global me + setupSql = """ + DROP TABLE IF EXISTS tcnt; + CREATE TABLE tcnt (id integer, i integer); + """ + cursor = self.connection.cursor() + try: + cursor.execute(setupSql) + self.connection.commit() + assert [] == 
postg.constraintsAndTypeForTable('tcnt',cursor) + cursor.execute("ALTER TABLE tcnt ADD CONSTRAINT tcnt_pkey PRIMARY KEY(id)") + self.connection.commit() + assert [('tcnt_pkey','p')] == postg.constraintsAndTypeForTable('tcnt',cursor) + cursor.execute("ALTER TABLE tcnt ADD CONSTRAINT tcnt_nnu UNIQUE(i)") + self.connection.commit() + assert set([('tcnt_pkey', 'p'), ('tcnt_nnu', 'u')]) == set(postg.constraintsAndTypeForTable('tcnt',cursor)) + fkSql = setupSql.replace('tcnt','fkcnt') + cursor.execute(fkSql) + self.connection.commit() + cursor.execute("ALTER TABLE fkcnt ADD CONSTRAINT fk_cn_id_fkey FOREIGN KEY(i) REFERENCES tcnt(id)") + self.connection.commit() + assert [('fk_cn_id_fkey', 'f')] == postg.constraintsAndTypeForTable('fkcnt',cursor) + finally: + cursor.execute("DROP TABLE IF EXISTS tcnt, fkcnt CASCADE") + self.connection.commit() + + def testColumnNameTypeDictionaryForTable(self): + global me + dropSql = "DROP TABLE IF EXISTS typet;" + # Each creation sql shall have one new line per column with a comment: --type which is the postgresql type of that column + # The expected types are programatically extracted from the creation sql and depend on that format + tableData = [ + ("numeric types", + """CREATE TABLE typet ( + s serial, --int4 + z bigserial, --int8 + i smallint, --int2 + j integer, --int4 + i2 int2, --int2 + i4 int4, --int4 + i8 int8, --int8 + k bigint, --int8 + c3 decimal(3), --numeric + n2 numeric(2), --numeric + c33 decimal(3,3), --numeric + n52 numeric(5,2), --numeric + r real, --float4 + d double precision --float8 + ); + """,), + ("char types", + """CREATE TABLE typet ( + v varchar(10), --varchar + w varchar(20), --varchar + x varchar, --varchar + b char, --bpchar + c char(10), --bpchar + d char(20), --bpchar + t text --text + ); + """,), + ("date and time types", + """CREATE TABLE typet ( + ts timestamp, --timestamp + tsp0 timestamp(0), --timestamp + tsp1 timestamp(1), --timestamp + tsz timestamp without time zone, --timestamp + tsz2 timestamp(2) without time zone, --timestamp + tsz3 timestamp(3) without time zone, --timestamp + tss timestamp with time zone, --timestamptz + tss2 timestamp(2) with time zone, --timestamptz + tss3 timestamp(3) with time zone, --timestamptz + i interval, --interval + i0 interval(0), --interval + i4 interval(4), --interval + d date, --date + t time, --time + t0 time(0), --time + t5 time(5), --time + tz time with time zone, --timetz + tz0 time(0) with time zone, --timetz + tz6 time(6) with time zone --timetz + ); + """,), + ("geometric types", + """CREATE TABLE typet ( + pt point, --point + l line, --line + s lseg, --lseg + b box, --box + p path, --path + pg polygon, --polygon + c circle --circle + ); + """,), + ("miscellany", + """CREATE TABLE typet ( + by bytea, --bytea + bo boolean, --bool + c cidr, --cidr + i inet, --inet + m macaddr, --macaddr + b1 bit, --bit + b2 bit(2), --bit + bv bit varying, --varbit + bv3 bit varying(3), --varbit + at1_ text[], --_text + ai1_ integer[], --_int4 + at1_2 text[2], --_text + ai1_3 integer[3], --_int4 + ac2_ char[][], --_bpchar + av2_12 varchar[1][2], --_varchar + av1_3 varchar ARRAY[3] --_varchar + ); + """,), + ] + cursor = self.connection.cursor() + cursor.execute(dropSql) + self.connection.commit() + for tup in tableData: + try: + expected = _extractExpectedFromSql(tup[1]) + cursor.execute(tup[1]) + self.connection.commit() + got = postg.columnNameTypeDictionaryForTable('typet',cursor) + assert expected == got, 'For %s, expected %s, got %s'%(tup[0],expected,got) + finally: + cursor.execute(dropSql) 
+ self.connection.commit() + + def testChildTablesForTable(self): + global me + cursor = self.connection.cursor() + cursor.execute("DROP TABLE IF EXISTS top,second,third,fourth CASCADE") + self.connection.commit() + try: + cursor.execute("CREATE TABLE top (id serial)") + self.connection.commit() + assert [] == postg.childTablesForTable('top',cursor) + cursor.execute("CREATE TABLE second(arity integer) INHERITS (top)") + self.connection.commit() + assert ['second'] == postg.childTablesForTable('top',cursor) + assert [] == postg.childTablesForTable('second',cursor) + cursor.execute("CREATE TABLE third(color text) INHERITS (top)") + self.connection.commit() + assert set(['second','third']) == set(postg.childTablesForTable('top',cursor)) + assert [] == postg.childTablesForTable('second',cursor) + assert [] == postg.childTablesForTable('third',cursor) + cursor.execute("CREATE TABLE fourth(strangeness text) INHERITS (second)") + self.connection.commit() + assert set(['second','third']) == set(postg.childTablesForTable('top',cursor)) + assert ['fourth'] == postg.childTablesForTable('second',cursor) + assert [] == postg.childTablesForTable('third',cursor) + assert [] == postg.childTablesForTable('fourth',cursor) + finally: + cursor.execute("DROP TABLE IF EXISTS top,second,third,fourth CASCADE") + self.connection.commit() + + def testConnectionStatus(self): + global me + cursor = self.connection.cursor() + assert "Status: READY, Transaction Status: IDLE" == postg.connectionStatus(self.connection) + try: + cursor.execute("create table tcon(id integer)") + assert "Status: BEGIN, Transaction Status: INTRANS" == postg.connectionStatus(self.connection) + self.connection.commit() + try: + cursor.execute("select name from tcon") + except: + assert "Status: BEGIN, Transaction Status: INERROR" == postg.connectionStatus(self.connection) + self.connection.rollback() + finally: + cursor.execute("drop table if exists tcon") + self.connection.commit() + + def testGetSequenceNameForColumn(self): + global me + t0 = testTableNames[0] + t1 = testTableNames[1] + cursor = self.connection.cursor() + cursor.execute("CREATE TABLE %s (id SERIAL NOT NULL, junk TEXT)"%t0) + cursor.execute("CREATE TABLE %s (id SERIAL NOT NULL, two SERIAL NOT NULL)"%t1) + self.connection.commit() + got = postg.getSequenceNameForColumn(t1,'id',cursor) + self.connection.commit() + assert "%s_id_seq"%t1 == got, 'Expected "%s_id_seq", got "%s"'%(t1,got) + got = postg.getSequenceNameForColumn(t1,'junk',cursor) + assert None == got,'Expected "None", got "%s"'%(t1,got) + got = postg.getSequenceNameForColumn(t1,'id',cursor) + self.connection.commit() + assert "%s_id_seq"%t1 == got, 'Expected "%s_id_seq", got "%s"'%(t1,got) + got = postg.getSequenceNameForColumn(t1,'two',cursor) + self.connection.commit() + assert "%s_two_seq"%t1 == got, 'Expected "%s_two_seq", got "%s"'%(t1,got) + + def testGetCurrentValue(self): + global me + t0 = testTableNames[0] + t1 = testTableNames[1] + cursor = self.connection.cursor() + cursor.execute("CREATE TABLE %s (id SERIAL NOT NULL, junk TEXT)"%t0) + cursor.execute("CREATE TABLE %s (id integer, two SERIAL NOT NULL)"%t1) + self.connection.commit() + assert None == postg.getCurrentValue(t0,'id',cursor) + self.connection.rollback() + count = 0 + for i in range(1,4): + cursor.execute("INSERT INTO %s (junk) values ('junk')"%t0) + got = postg.getCurrentValue(t0,'id',cursor) + self.connection.commit() + assert i == got, 'Expected %s, got %s'%(i,got) + for i in range(1,4): + cursor.execute("INSERT INTO %s (id) values 
(%s)"%(t1,i)) + gotId = postg.getCurrentValue(t1,'id',cursor) + gotTwo = postg.getCurrentValue(t1,'two',cursor) + self.connection.commit() + assert None == gotId,'Expected "None", got "%s"'%(gotId) + assert i == gotTwo, 'Expected %s, got %s'%(i,gotTwo) + +def _extractExpectedFromSql(sql): + """Expect newline separated columns with trailing '--type' per line, nothing interesting unless there is a '--' comment""" + ret = {} + cols = sql.split("\n") + for c in cols: + if '--' in c: + cname = c.split()[0] + ctype = c.split('--')[1].strip() + ret[cname] = ctype + + return ret + diff --git a/socorro/unittest/external/filesystem/__init__.py b/socorro/unittest/external/filesystem/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/socorro/unittest/external/filesystem/create_json_dump_store.py b/socorro/unittest/external/filesystem/create_json_dump_store.py new file mode 100644 index 0000000000..6ba7b84b0e --- /dev/null +++ b/socorro/unittest/external/filesystem/create_json_dump_store.py @@ -0,0 +1,115 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +import datetime +import errno +import json +import os +import time + + +import socorro.external.filesystem.json_dump_storage as JDS +from socorro.lib.datetimeutil import utc_now, UTC + +jsonFileData = { + '0bba61c5-dfc3-43e7-dead-8afd20071025': ('2007-10-25-05-04','webhead02','0b/ba/61/c5','2007/10/25/05/00/webhead02_0'), + '0bba929f-8721-460c-dead-a43c20071025': ('2007-10-25-05-04','webhead02','0b/ba/92/9f','2007/10/25/05/00/webhead02_0'), + '0b9ff107-8672-4aac-dead-b2bd20081225': ('2008-12-25-05-00','webhead01','0b/9f/f1/07','2008/12/25/05/00/webhead01_0'), + '22adfb61-f75b-11dc-dead-001320081225': ('2008-12-25-05-01','webhead01','22/ad/fb/61','2008/12/25/05/00/webhead01_0'), + 'b965de73-ae90-a935-dead-03ae20081225': ('2008-12-25-05-04','webhead01','b9/65/de/73','2008/12/25/05/00/webhead01_0'), + '0b781b88-ecbe-4cc4-dead-6bbb20081225': ('2008-12-25-05-05','webhead01','0b/78/1b/88','2008/12/25/05/05/webhead01_0'), + '0b8344d6-9021-4db9-dead-a15320081225': ('2008-12-25-05-06','webhead01','0b/83/44/d6','2008/12/25/05/05/webhead01_0'), + '0b94199b-b90b-4683-dead-411420081226': ('2008-12-26-05-21','webhead01','0b/94/19/9b','2008/12/26/05/20/webhead01_0'), + '0b9eedc3-9a79-4ce2-dead-155920081226': ('2008-12-26-05-24','webhead01','0b/9e/ed/c3','2008/12/26/05/20/webhead01_0'), + '0b9fd6da-27e4-46aa-dead-3deb20081226': ('2008-12-26-05-25','webhead02','0b/9f/d6/da','2008/12/26/05/25/webhead02_0'), + '0ba32a30-2476-4724-dead-de17e3081125': ('2008-11-25-05-00','webhead02','0b/a3/2a', '2008/11/25/05/00/webhead02_0'), + '0bad640f-5825-4d42-dead-21b8e3081125': ('2008-11-25-05-04','webhead02','0b/ad/64', '2008/11/25/05/00/webhead02_0'), + '0bae7049-bbff-49f2-dead-7e9fe2081125': ('2008-11-25-05-05','webhead02','0b/ae', '2008/11/25/05/05/webhead02_0'), + '0baf1b4d-dad3-4d35-dead-b9dce2081125': ('2008-11-25-05-06','webhead02','0b/af', '2008/11/25/05/05/webhead02_0'), +} + +jsonMoreData = { + '28adfb61-f75b-11dc-b6be-001320081225': ('2008-12-25-05-01','webhead01','28/ad/fb/61','2008/12/25/05/00'), + '29adfb61-f75b-11dc-b6be-001320081225': ('2008-12-25-05-00','webhead01','29/ad/fb/61','2008/12/25/05/00'), +} + +jsonTooMany = { + '23adfb61-f75b-11dc-b6be-001320081225': ('2008-12-25-05-01','webhead01','23/ad/fb/61','2008/12/25/05/00'), + '24adfb61-f75b-11dc-b6be-001320081225': 
('2008-12-25-05-01','webhead01','24/ad/fb/61','2008/12/25/05/00'), + '25adfb61-f75b-11dc-b6be-001320081225': ('2008-12-25-05-02','webhead01','25/ad/fb/61','2008/12/25/05/00'), + '26adfb61-f75b-11dc-b6be-001320081225': ('2008-12-25-05-02','webhead01','26/ad/fb/61','2008/12/25/05/00'), + '27adfb61-f75b-11dc-b6be-001320081225': ('2008-12-25-05-03','webhead01','27/ad/fb/61','2008/12/25/05/00'), + } + + +jsonBadUuid = '66666666-6666-6666-6666-666620081225' + +def getSlot(minsperslot,minute): + """Return the beginning minute of the slot of length minsperslot that contains minute""" + return minsperslot * int(minute/minsperslot) + +def minimalJsonFileContents(dataMap = None): + """ + Generate minimal json file contents encoding by default: + a map of 'ProductName', 'Version' and 'BuildID' + or if dataMap is provided the contents of the map. Note that values in that map MUST be strings that can be formatted to contain a distinguishing integer + """ + if not dataMap: + dataMap = {'ProductName':'bogusName-%02d', + 'Version':'bogusVersion-%02d', + 'BuildID':'bogusBuildID-%02d', + } + cookie = 0 + while True: + retMap = {} + for k,v in dataMap.items(): + retMap[k] = v%cookie + yield json.dumps(retMap) + cookie += 1 + +def createTestSet(testData,jsonKwargs,rootDir): + try: + os.makedirs(rootDir) + except OSError,x: + if errno.EEXIST != x.errno: raise + storage = JDS.JsonDumpStorage(rootDir, **jsonKwargs) + jsonIsEmpty = jsonKwargs.get('jsonIsEmpty', False) + jsonIsBogus = jsonKwargs.get('jsonIsBogus', True) + jsonFileGenerator = jsonKwargs.get('jsonFileGenerator',None) + if 'default' == jsonFileGenerator: + jsonFileGenerator = minimalJsonFileContents() + thedt = utc_now() + for uuid,data in testData.items(): + if data[0].startswith('+'): + if thedt.second >= 58: + print "\nSleeping for %d seconds" %(61-thedt.second) + time.sleep(61-thedt.second) + thedt = utc_now() + slot = { + '+0': getSlot(storage.minutesPerSlot,thedt.minute), + '+5': getSlot(storage.minutesPerSlot,thedt.minute+5), + '+10':getSlot(storage.minutesPerSlot,thedt.minute+10), + } + d3h = '%d/%02d/%02d/%02d/%s' %(thedt.year,thedt.month,thedt.day,thedt.hour,slot[data[0]]) + data[3] = "%s/%s" % (d3h,data[3]) + else: + thedt = datetime.datetime(*[int(x) for x in data[0].split('-')], tzinfo=UTC) + fj,fd = storage.newEntry(uuid,webheadHostName=data[1],timestamp = thedt) + try: + if jsonIsEmpty: + pass + elif jsonIsBogus: + fj.write('json test of %s\n' % uuid) + else: + if jsonFileGenerator: + fileContents = jsonFileGenerator.next() + else: + fileContents = '{"what": "legal json, bad contents", "uuid": "%s\"}\n'% uuid + fj.write(fileContents) + finally: + if fj: fj.close() + try: + fd.write('dump test of %s\n' % uuid) + finally: + if fd: fd.close() diff --git a/socorro/unittest/external/filesystem/test_crash_data.py b/socorro/unittest/external/filesystem/test_crash_data.py new file mode 100644 index 0000000000..5792ff9305 --- /dev/null +++ b/socorro/unittest/external/filesystem/test_crash_data.py @@ -0,0 +1,216 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
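
The helpers in create_json_dump_store.py above are small but easy to misread: getSlot() rounds a minute down to the start of its storage slot, and minimalJsonFileContents() is an infinite generator whose yielded JSON embeds a counter so successive fake crashes are distinguishable. A short usage illustration, assuming the module above is importable::

    from socorro.unittest.external.filesystem import (
        create_json_dump_store as cjds
    )

    # getSlot(minsperslot, minute) -> first minute of the slot holding `minute`
    cjds.getSlot(5, 23)   # -> 20
    cjds.getSlot(5, 4)    # -> 0

    gen = cjds.minimalJsonFileContents()
    gen.next()  # JSON with ProductName/Version/BuildID values suffixed "-00"
    gen.next()  # same keys, values now suffixed "-01"
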
+ +import shutil +import tempfile +from configman import ConfigurationManager, Namespace +from mock import Mock, patch +from nose.tools import eq_, assert_raises + +from socorro.external import MissingArgumentError, ResourceNotFound, \ + ResourceUnavailable +from socorro.external.filesystem import crash_data, crashstorage +from socorro.unittest.testbase import TestCase + + +class IntegrationTestCrashData(TestCase): + + def setUp(self): + """Insert fake data into filesystem. """ + self.std_tmp_dir = tempfile.mkdtemp() + self.def_tmp_dir = tempfile.mkdtemp() + self.pro_tmp_dir = tempfile.mkdtemp() + + self.config_manager = self._common_config_setup() + + with self.config_manager.context() as config: + store = crashstorage.FileSystemCrashStorage(config.filesystem) + + # A complete crash report (raw, dump and processed) + fake_raw_dump_1 = 'peter is a swede' + fake_raw_dump_2 = 'lars is a norseman' + fake_raw_dump_3 = 'adrian is a frenchman' + fake_dumps = {'upload_file_minidump': fake_raw_dump_1, + 'lars': fake_raw_dump_2, + 'adrian': fake_raw_dump_3} + fake_raw = { + 'name': 'Peter', + 'legacy_processing': 0, + 'submitted_timestamp': '2013-05-04' + } + fake_processed = { + 'name': 'Peter', + 'uuid': '114559a5-d8e6-428c-8b88-1c1f22120314', + 'completeddatetime': '2012-01-01T00:00:00', + 'email': 'peter@fake.org', + } + + store.save_raw_crash( + fake_raw, + fake_dumps, + '114559a5-d8e6-428c-8b88-1c1f22120314' + ) + store.save_processed(fake_processed) + + # A non-processed crash report + fake_raw = { + 'name': 'Adrian', + 'legacy_processing': 0, + 'submitted_timestamp': '2013-05-04' + } + + store.save_raw_crash( + fake_raw, + fake_dumps, + '58727744-12f5-454a-bcf5-f688a2120821' + ) + def tearDown(self): + """Remove all temp files and folders. """ + shutil.rmtree(self.std_tmp_dir) + shutil.rmtree(self.def_tmp_dir) + shutil.rmtree(self.pro_tmp_dir) + + def _common_config_setup(self): + mock_logging = Mock() + required_config = Namespace() + required_config.namespace('filesystem') + required_config.filesystem.filesystem_class = \ + crashstorage.FileSystemCrashStorage + required_config.filesystem.add_option('logger', default=mock_logging) + config_manager = ConfigurationManager( + [required_config], + app_name='testapp', + app_version='1.0', + app_description='app description', + values_source_list=[{'filesystem': { + 'logger': mock_logging, + 'std_fs_root': self.std_tmp_dir, + 'def_fs_root': self.def_tmp_dir, + 'pro_fs_root': self.pro_tmp_dir, + }}], + argv_source=[] + ) + return config_manager + + @patch('socorro.external.rabbitmq.priorityjobs.Priorityjobs') + def test_get(self, priorityjobs_mock): + with self.config_manager.context() as config: + + #priorityjobs_mock = Mock() + service = crash_data.CrashData( + config=config, + all_services={'Priorityjobs': priorityjobs_mock} + ) + params = { + 'datatype': 'raw', + 'uuid': '114559a5-d8e6-428c-8b88-1c1f22120314' + } + + # Test 1: get a raw dump + res_expected = ('peter is a swede', + 'application/octet-stream') + res = service.get(**params) + + eq_(res, res_expected) + + # Test 2: get a raw crash + params['datatype'] = 'meta' + res_expected = { + 'name': 'Peter', + 'legacy_processing': 0, + 'submitted_timestamp': '2013-05-04' + } + res = service.get(**params) + + eq_(res, res_expected) + + # Test 3: get a processed crash + params['datatype'] = 'processed' + res_expected = { + 'name': 'Peter', + 'uuid': '114559a5-d8e6-428c-8b88-1c1f22120314', + 'completeddatetime': '2012-01-01T00:00:00' + } + res = service.get(**params) + + eq_(res, res_expected) 
+ + # Test 3a: get a unredacted processed crash + params['datatype'] = 'unredacted' + res_expected = { + 'name': 'Peter', + 'uuid': '114559a5-d8e6-428c-8b88-1c1f22120314', + 'completeddatetime': '2012-01-01T00:00:00', + 'email': 'peter@fake.org', + } + res = service.get(**params) + + eq_(res, res_expected) + + # Test 4: missing parameters + assert_raises( + MissingArgumentError, + service.get + ) + assert_raises( + MissingArgumentError, + service.get, + **{'uuid': '114559a5-d8e6-428c-8b88-1c1f22120314'} + ) + + # Test 5: crash cannot be found + assert_raises( + ResourceNotFound, + service.get, + **{ + 'uuid': 'c44245f4-c93b-49b8-86a2-c15dc2130504', + 'datatype': 'processed' + } + ) + # Test 5a: crash cannot be found + assert_raises( + ResourceNotFound, + service.get, + **{ + 'uuid': 'c44245f4-c93b-49b8-86a2-c15dc2130504', + 'datatype': 'unredacted' + } + ) + + # Test 6: not yet available crash + assert_raises( + ResourceUnavailable, + service.get, + **{ + 'uuid': '58727744-12f5-454a-bcf5-f688a2120821', + 'datatype': 'processed' + } + ) + priorityjobs_mock.cls.return_value.create.assert_called_once_with( + uuid='58727744-12f5-454a-bcf5-f688a2120821' + ) + priorityjobs_mock.cls.return_value.create.reset_mock() + + # Test 6a: not yet available crash + assert_raises( + ResourceUnavailable, + service.get, + **{ + 'uuid': '58727744-12f5-454a-bcf5-f688a2120821', + 'datatype': 'unredacted' + } + ) + priorityjobs_mock.cls.return_value.create.assert_called_once_with( + uuid='58727744-12f5-454a-bcf5-f688a2120821' + ) + + # Test 7: raw crash cannot be found + assert_raises( + ResourceNotFound, + service.get, + **{ + 'uuid': 'c44245f4-c93b-49b8-86a2-c15dc2130505', + 'datatype': 'raw' + } + ) diff --git a/socorro/unittest/external/filesystem/test_crashstorage.py b/socorro/unittest/external/filesystem/test_crashstorage.py new file mode 100644 index 0000000000..681beae966 --- /dev/null +++ b/socorro/unittest/external/filesystem/test_crashstorage.py @@ -0,0 +1,266 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
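#----------------------------------------------------------------------------
# Illustrative sketch, not part of the patch above: the behaviour that
# IntegrationTestCrashData exercises. crash_data.CrashData.get() dispatches on
# the 'datatype' parameter; this stand-in only names the crash-storage calls
# the surrounding tests actually use and is not the real implementation.
def get_crash(store, uuid, datatype):
    if datatype == 'raw':
        # raw minidump bytes plus the content type asserted in Test 1
        return store.get_raw_dump(uuid), 'application/octet-stream'
    if datatype == 'meta':
        return store.get_raw_crash(uuid)                 # Test 2
    if datatype == 'processed':
        return store.get_processed(uuid)                 # Test 3: redacted, no 'email'
    if datatype == 'unredacted':
        return store.get_unredacted_processed(uuid)      # Test 3a: includes 'email'
    raise ValueError('unknown datatype %r' % datatype)
#----------------------------------------------------------------------------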
+ +import os +import os.path +import shutil +import tempfile +import inspect + +from nose.tools import eq_, ok_, assert_raises +from configman import ConfigurationManager +from mock import Mock + +from socorro.external.crashstorage_base import CrashIDNotFound +from socorro.external.filesystem.crashstorage import ( + FileSystemRawCrashStorage, + FileSystemThrottledCrashStorage, + FileSystemCrashStorage) +from socorro.unittest.testbase import TestCase +from socorro.lib.util import DotDict + + +class TestFileSystemCrashStorage(TestCase): + + def setUp(self): + super(TestFileSystemCrashStorage, self).setUp() + self.std_tmp_dir = tempfile.mkdtemp() + self.def_tmp_dir = tempfile.mkdtemp() + self.pro_tmp_dir = tempfile.mkdtemp() + + def tearDown(self): + super(TestFileSystemCrashStorage, self).tearDown() + shutil.rmtree(self.std_tmp_dir) + + @staticmethod + def _get_class_methods(klass): + return dict((n, ref) for (n, ref) + in inspect.getmembers(klass, inspect.ismethod) + if not n.startswith('_') and n in klass.__dict__) + + def _find_file(self, in_, filename): + found = [] + for f in os.listdir(in_): + path = os.path.join(in_, f) + if os.path.isdir(path): + found.extend(self._find_file(path, filename)) + elif os.path.isfile(path) and filename in path: + found.append(path) + return found + + def _common_config_setup(self): + mock_logging = Mock() + required_config = FileSystemCrashStorage.get_required_config() + required_config.add_option('logger', default=mock_logging) + config_manager = ConfigurationManager( + [required_config], + app_name='testapp', + app_version='1.0', + app_description='app description', + values_source_list=[{ + 'logger': mock_logging, + 'std_fs_root': self.std_tmp_dir, + 'def_fs_root': self.def_tmp_dir, + 'pro_fs_root': self.pro_tmp_dir, + }], + argv_source=[] + ) + return config_manager + + def _common_basic_test(self, config, crashstorage): + fake_dump = 'this is a fake dump' + eq_(list(crashstorage.new_crashes()), []) + raw = {"name": "Peter", + "legacy_processing": 0, + "submitted_timestamp": '2012-03-14 15:10:33'} + crashstorage.save_raw_crash( + raw, + fake_dump, + "114559a5-d8e6-428c-8b88-1c1f22120314" + ) + + fake_dumps = {None: 'this is a fake dump', 'aux01': 'aux01 fake dump'} + raw = {"name": "Lars", + "legacy_processing": 0, + "submitted_timestamp": '2012-05-04 15:10:33'} + crashstorage.save_raw_crash( + raw, + fake_dumps, + "114559a5-d8e6-428c-8b88-1c1f22120504" + ) + eq_(sorted(list(crashstorage.new_crashes())), + sorted(["114559a5-d8e6-428c-8b88-1c1f22120314", + "114559a5-d8e6-428c-8b88-1c1f22120504", + ])) + + ok_( + os.path.exists( + crashstorage.std_crash_store.getJson( + '114559a5-d8e6-428c-8b88-1c1f22120314'))) + ok_( + os.path.exists( + crashstorage.std_crash_store.getDump( + '114559a5-d8e6-428c-8b88-1c1f22120314'))) + ok_( + os.path.exists( + crashstorage.std_crash_store.getJson( + '114559a5-d8e6-428c-8b88-1c1f22120504'))) + ok_( + os.path.exists( + crashstorage.std_crash_store.getDump( + '114559a5-d8e6-428c-8b88-1c1f22120504'))) + + meta = crashstorage.get_raw_crash( + '114559a5-d8e6-428c-8b88-1c1f22120314') + ok_(isinstance(meta, DotDict)) + eq_(meta['name'], 'Peter') + + dump = crashstorage.get_raw_dump( + '114559a5-d8e6-428c-8b88-1c1f22120314') + ok_(isinstance(dump, basestring)) + ok_("fake dump" in dump) + + dumps = crashstorage.get_raw_dumps( + '114559a5-d8e6-428c-8b88-1c1f22120504' + ) + eq_(['upload_file_minidump', 'aux01'], dumps.keys()) + eq_(['this is a fake dump', 'aux01 fake dump'], + dumps.values()) + + 
crashstorage.remove('114559a5-d8e6-428c-8b88-1c1f22120314') + assert_raises(OSError, + crashstorage.std_crash_store.getJson, + '114559a5-d8e6-428c-8b88-1c1f22120314') + assert_raises(OSError, + crashstorage.std_crash_store.getDump, + '114559a5-d8e6-428c-8b88-1c1f22120314') + assert_raises(CrashIDNotFound, + crashstorage.get_raw_crash, + '114559a5-d8e6-428c-8b88-1c1f22120314') + assert_raises(CrashIDNotFound, + crashstorage.get_raw_dump, + '114559a5-d8e6-428c-8b88-1c1f22120314') + + def _common_throttle_test(self, config, crashstorage): + fake_dump = 'this is a fake dump' + crashstorage = FileSystemThrottledCrashStorage(config) + eq_(list(crashstorage.new_crashes()), []) + raw = {"name": "Peter", + "legacy_processing": 1, + "submitted_timestamp": '2012-05-04 15:10:33'} + crashstorage.save_raw_crash( + raw, + fake_dump, + "114559a5-d8e6-428c-8b88-1c1f22120314" + ) + + fake_dumps = {None: 'this is a fake dump', 'aux01': 'aux01 fake dump'} + raw = {"name": "Lars", + "legacy_processing": 0, + "submitted_timestamp": '2012-05-04 15:10:33'} + crashstorage.save_raw_crash( + raw, + fake_dumps, + "114559a5-d8e6-428c-8b88-1c1f22120504" + ) + eq_(list(crashstorage.new_crashes()), + ["114559a5-d8e6-428c-8b88-1c1f22120504",]) + + ok_( + os.path.exists( + crashstorage.def_crash_store.getJson( + '114559a5-d8e6-428c-8b88-1c1f22120314'))) + ok_( + os.path.exists( + crashstorage.def_crash_store.getDump( + '114559a5-d8e6-428c-8b88-1c1f22120314'))) + assert_raises(OSError, + crashstorage.std_crash_store.getJson, + '114559a5-d8e6-428c-8b88-1c1f22120314') + assert_raises(OSError, + crashstorage.std_crash_store.getDump, + '114559a5-d8e6-428c-8b88-1c1f22120314') + + meta = crashstorage.get_raw_crash( + '114559a5-d8e6-428c-8b88-1c1f22120314') + ok_(isinstance(meta, DotDict)) + eq_(meta['name'], 'Peter') + + dump = crashstorage.get_raw_dump( + '114559a5-d8e6-428c-8b88-1c1f22120314') + ok_(isinstance(dump, basestring)) + ok_("fake dump" in dump) + + crashstorage.remove('114559a5-d8e6-428c-8b88-1c1f22120314') + assert_raises(OSError, + crashstorage.def_crash_store.getJson, + '114559a5-d8e6-428c-8b88-1c1f22120314') + assert_raises(OSError, + crashstorage.def_crash_store.getDump, + '114559a5-d8e6-428c-8b88-1c1f22120314') + assert_raises(OSError, + crashstorage.std_crash_store.getJson, + '114559a5-d8e6-428c-8b88-1c1f22120314') + assert_raises(OSError, + crashstorage.std_crash_store.getDump, + '114559a5-d8e6-428c-8b88-1c1f22120314') + + def test_filesystem_raw_crashstorage(self): + config_manager = self._common_config_setup() + with config_manager.context() as config: + crashstorage = FileSystemRawCrashStorage(config) + self._common_basic_test(config, crashstorage) + + def test_filesystem_throttled_crashstorage(self): + config_manager = self._common_config_setup() + with config_manager.context() as config: + crashstorage = FileSystemThrottledCrashStorage(config) + self._common_basic_test(config, crashstorage) + self._common_throttle_test(config, crashstorage) + + def test_filesystem_crashstorage(self): + config_manager = self._common_config_setup() + with config_manager.context() as config: + crashstorage = FileSystemCrashStorage(config) + self._common_throttle_test(config, crashstorage) + + crashstorage = FileSystemCrashStorage(config) + eq_(list(crashstorage.new_crashes()), []) + + processed_crash = {"name": "Peter", "legacy_processing": 1} + assert_raises( + CrashIDNotFound, + crashstorage.save_processed, + processed_crash + ) + processed_crash = { + "name": "Peter", + "uuid": "114559a5-d8e6-428c-8b88-1c1f22120314", + 
"email": "lars@nowhere.org", + } + expected_processed_crash = { + "name": "Peter", + "uuid": "114559a5-d8e6-428c-8b88-1c1f22120314", + } + crash_id = processed_crash['uuid'] + crashstorage.save_processed(processed_crash) + returned_processed_crash = crashstorage.get_processed(crash_id) + eq_( + expected_processed_crash, + returned_processed_crash + ) + ok_(isinstance(returned_processed_crash, + DotDict)) + returned_processed_crash = \ + crashstorage.get_unredacted_processed(crash_id) + eq_( + processed_crash, + returned_processed_crash + ) + + crashstorage.remove(crash_id) + assert_raises(CrashIDNotFound, + crashstorage.get_processed, + crash_id) + crashstorage.remove(crash_id) diff --git a/socorro/unittest/external/filesystem/test_create_json_dump_store.py b/socorro/unittest/external/filesystem/test_create_json_dump_store.py new file mode 100644 index 0000000000..4e96d01223 --- /dev/null +++ b/socorro/unittest/external/filesystem/test_create_json_dump_store.py @@ -0,0 +1,162 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +import socorro.unittest.external.filesystem.create_json_dump_store as createJDS + +import os +import shutil + +from nose.tools import * + +def testGetSlot(): + testData = [ + (0,1,0), + (0,30,0), + (1,5,0), + (1,12,0), + (4,5,0), + (5,5,5), + (29,30,0), + (30,30,30), + (59,5,55), + (59,12,48), + ] + for minutes, size, expected in testData: + got = createJDS.getSlot(size,minutes) + assert expected == got, 'expected %s from getSlot(%s,%s), got %s'%(expected,minute,size,got) + assert_raises(ZeroDivisionError, createJDS.getSlot, 0, 12) + +def testMinimalJsonFileContents(): + testMap = {'first':'a%d'} + gen = createJDS.minimalJsonFileContents(testMap) + for i in range(3): + expected = '{"first": "a%d"}'%i + got = gen.next() + assert expected == got + gen = createJDS.minimalJsonFileContents() + for i in range(3): + expected = '{"BuildID": "bogusBuildID-%02d", "Version": "bogusVersion-%02d", "ProductName": "bogusName-%02d"}'%(i,i,i) + got = gen.next() + assert expected == got + +def testCreateTestSet(): + testDir = "./TEST_CREATE_DIR" + try: + shutil.rmtree(testDir) + except: + pass + assert not os.path.exists(testDir) + try: + createJDS.createTestSet({},{},testDir) + assert os.path.isdir(testDir) + finally: + try: + shutil.rmtree(testDir) + except: + pass + + expected = { + '%s/20071025/date/05'%testDir:(set(['04']), set([])), + '%s/20071025/date'%testDir:(set(['05']), set([])), + '%s/20071025/name/0b/ba/61/c5'%testDir:(set(['0bba61c5-dfc3-43e7-effe-8afd20071025']), set(['0bba61c5-dfc3-43e7-effe-8afd20071025.dump', '0bba61c5-dfc3-43e7-effe-8afd20071025.json'])), + '%s/20071025/name/0b'%testDir:(set(['ba']), set([])), + '%s/20071025/date/05/04'%testDir:(set(['webhead02_0']), set([])), + '%s/20071025/name/0b/ba/61'%testDir:(set(['c5']), set([])), + '%s/20071025'%testDir:(set(['date', 'name']), set([])), + '%s/20071025/date/05/04/webhead02_0'%testDir:(set(['0bba61c5-dfc3-43e7-effe-8afd20071025']), set([])), + '%s/20071025/name'%testDir:(set(['0b']), set([])), + '%s'%testDir:(set(['20071025']), set([])), + '%s/20071025/name/0b/ba'%testDir:(set(['61']), set([])), + } + minSet = {'0bba61c5-dfc3-43e7-effe-8afd20071025': ('2007-10-25-05-04','webhead02','0b/ba/61/c5','2007/10/25/05/00/webhead02_0')} + try: + createJDS.createTestSet(minSet,{},testDir) + got = {} + for dirpath, files, dirs in os.walk(testDir): + got[dirpath] = 
(set(files),set(dirs)) + if expected != got: + print + for k, v in expected.items(): + print ' X %s: %s'%(k,v) + if k in got: + if got[k] == expected[k]: + print ' G %s: %s'%(k,got[k]) + else: + print 'xx G %s: %s'%(k,got[k]) + else: + print 'xx G %s: (IS MISSING)'%(k) + for k,v in got.items(): + if not k in expected: + print '++ G %s: %s'%(k,v) + assert expected == got + f = open(os.path.join(testDir,'20071025/name/0b/ba/61/c5/0bba61c5-dfc3-43e7-effe-8afd20071025.dump')) + data = f.readlines() + assert 1 == len(data) + assert 'dump test of 0bba61c5-dfc3-43e7-effe-8afd20071025' == data[0].strip() + f.close() + f = open(os.path.join(testDir,'20071025/name/0b/ba/61/c5/0bba61c5-dfc3-43e7-effe-8afd20071025.json')) + data = f.readlines() + assert 1 == len(data) + assert 'json test of 0bba61c5-dfc3-43e7-effe-8afd20071025' == data[0].strip() + f.close() + finally: + try: + shutil.rmtree(testDir) + except: + pass + + try: + createJDS.createTestSet(minSet,{'jsonIsEmpty':True},testDir) + f = open(os.path.join(testDir,'20071025/name/0b/ba/61/c5/0bba61c5-dfc3-43e7-effe-8afd20071025.dump')) + data = f.readlines() + assert 1 == len(data) + assert 'dump test of 0bba61c5-dfc3-43e7-effe-8afd20071025' == data[0].strip() + f.close() + f = open(os.path.join(testDir,'20071025/name/0b/ba/61/c5/0bba61c5-dfc3-43e7-effe-8afd20071025.json')) + data = f.readlines() + assert 0 == len(data) + f.close() + finally: + try: + shutil.rmtree(testDir) + except: + pass + + try: + createJDS.createTestSet(minSet,{'jsonIsBogus':False, 'jsonFileGenerator':'default'},testDir) + f = open(os.path.join(testDir,'20071025/name/0b/ba/61/c5/0bba61c5-dfc3-43e7-effe-8afd20071025.dump')) + data = f.readlines() + assert 1 == len(data) + assert 'dump test of 0bba61c5-dfc3-43e7-effe-8afd20071025' == data[0].strip() + f.close() + f = open(os.path.join(testDir,'20071025/name/0b/ba/61/c5//0bba61c5-dfc3-43e7-effe-8afd20071025.json')) + data = f.readlines() + assert 1 == len(data) + expect='{"BuildID": "bogusBuildID-00", "Version": "bogusVersion-00", "ProductName": "bogusName-00"}' + assert expect == data[0].strip() + f.close() + finally: + try: + shutil.rmtree(testDir) + except: + pass + + try: + createJDS.createTestSet(minSet,{'jsonIsBogus':False},testDir) + f = open(os.path.join(testDir,'20071025/name/0b/ba/61/c5/0bba61c5-dfc3-43e7-effe-8afd20071025.dump')) + data = f.readlines() + assert 1 == len(data) + assert 'dump test of 0bba61c5-dfc3-43e7-effe-8afd20071025' == data[0].strip() + f.close() + f = open(os.path.join(testDir,'20071025/name/0b/ba/61/c5/0bba61c5-dfc3-43e7-effe-8afd20071025.json')) + data = f.readlines() + assert 1 == len(data) + expect='{"what": "legal json, bad contents", "uuid": "0bba61c5-dfc3-43e7-effe-8afd20071025"}' + assert expect == data[0].strip() + f.close() + finally: + try: + shutil.rmtree(testDir) + except: + pass diff --git a/socorro/unittest/external/filesystem/test_dump_storage.py b/socorro/unittest/external/filesystem/test_dump_storage.py new file mode 100644 index 0000000000..82def94aa9 --- /dev/null +++ b/socorro/unittest/external/filesystem/test_dump_storage.py @@ -0,0 +1,391 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
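#----------------------------------------------------------------------------
# Illustrative sketch, not part of the patch above: the on-disk layout that the
# `expected` dictionary in testCreateTestSet encodes, built by hand for the one
# ooid in minSet. Nothing below is the real storage API.
import os

root = 'TEST_CREATE_DIR'
ooid = '0bba61c5-dfc3-43e7-effe-8afd20071025'        # date suffix 2007-10-25
name_dir = os.path.join(root, '20071025', 'name', '0b', 'ba', '61', 'c5')
date_dir = os.path.join(root, '20071025', 'date', '05', '04', 'webhead02_0')
json_file = os.path.join(name_dir, ooid + '.json')   # raw crash metadata
dump_file = os.path.join(name_dir, ooid + '.dump')   # minidump payload
# date_dir/<ooid> is a symlink back to name_dir, and name_dir/<ooid> is a
# symlink back to date_dir; in the `expected` pairs of sets those symlinks show
# up as directory entries and the .json/.dump files as plain files.
#----------------------------------------------------------------------------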
+ +import datetime +import logging +import os +import shutil +import sys +import time + +from nose.tools import * + +import socorro.external.filesystem.dump_storage as dumpStorage +import socorro.lib.util as socorro_util + +import socorro.unittest.external.filesystem.create_json_dump_store as createJDS + +from socorro.lib.datetimeutil import utc_now, UTC + +class TestDumpStorage: + def setUp(self): + self.expectedTestDir = os.path.join('.','TEST-DUMP') + self.testDir = self.expectedTestDir+os.sep + self.testData = { + '0bba61c5-dfc3-43e7-dead-8afd22081225': ['0b/ba',datetime.datetime(2008,12,25,12,0, 0, tzinfo=UTC),'12/00'], + '0bba929f-8721-460c-dead-a43c20081225': ['0b/ba/92/9f',datetime.datetime(2008,12,25,12,0, 1, tzinfo=UTC),'12/00'], + '0b9ff107-8672-4aac-dead-b2bd22081225': ['0b/9f',datetime.datetime(2008,12,25,12,0,59, tzinfo=UTC),'12/00'], + '22adfb61-f75b-11dc-dead-001322081225': ['22/ad',datetime.datetime(2008,12,25,12,55,0, tzinfo=UTC),'12/55'], + 'b965de73-ae90-a935-dead-03ae22080101': ['b9/65',datetime.datetime(2008, 1, 1,1,20,31, tzinfo=UTC),'01/20'], + '0b781b88-ecbe-4cc4-dead-6bbb20080203': ['0b/78/1b/88',datetime.datetime(2008, 2, 3, 4,1,45, tzinfo=UTC),'04/00'], + } + self.ctorData = { + 0:{'dateName':'otherDate','logger':logging.getLogger('otherLogger')}, + 1:{'indexName':'otherIndex','minutesPerSlot':10}, + 2:{'minutesPerSlot':'10','dirPermissions':0577}, + 3:{'dumpGID':32111,'subSlotCount':3}, + 4:{} + } + self.expectedCtor = { + 0:{'root':self.expectedTestDir, + 'dateName':'otherDate', + 'indexName':'name', + 'minutesPerSlot':5, + 'dirPermissions':0770, + 'dumpGID':None, + 'logger.name':'otherLogger', + 'subSlotCount': 0, + }, + 1:{'root':self.expectedTestDir, + 'dateName':'date', + 'indexName':'otherIndex', + 'minutesPerSlot':10, + 'dirPermissions':0770, + 'dumpGID':None, + 'logger.name':'dumpStorage', + 'subSlotCount': 0, + }, + 2:{'root':self.expectedTestDir, + 'dateName':'date', + 'indexName':'name', + 'minutesPerSlot':10, + 'dirPermissions':0577, + 'dumpGID':None, + 'logger.name':'dumpStorage', + 'subSlotCount': 0, + }, + 3:{'root':self.expectedTestDir, + 'dateName':'date', + 'indexName':'name', + 'minutesPerSlot':5, + 'dirPermissions':0770, + 'dumpGID':32111, + 'logger.name':'dumpStorage', + 'subSlotCount': 3, + }, + 4:{'root':self.expectedTestDir, + 'dateName':'date', + 'indexName':'name', + 'minutesPerSlot':5, + 'dirPermissions':0770, + 'dumpGID':None, + 'logger.name':'dumpStorage', + 'subSlotCount': 0, + }, + } + + def tearDown(self): + try: + shutil.rmtree(self.testDir) + except OSError: + pass # ok if there is already no such directory + + def testConstructor(self): + for i in range(len(self.expectedCtor)): + if i in (1,3,5): + root = self.expectedTestDir + else: + root = self.testDir + d = dumpStorage.DumpStorage(root,**self.ctorData[i]) + for k in self.expectedCtor[i]: + e = self.expectedCtor[i][k] + g = eval("d."+k) + if type(1) == type(e): + assert e == g,'At loop %d, key %s: Wanted "%0o", got "%0o"'%(i,k,e,g) + else: + assert e == g,'At loop %d, key %s: Wanted "%s", got "%s"'%(i,k,e,g) + + def testNewEntry(self): + # test the default case + d = dumpStorage.DumpStorage(self.testDir) + dateLeafSet = set() + expectedLeafs = set(['55', '00', '20']) + for k,v in self.testData.items(): + nd,dd = d.newEntry(k,v[1]) + dateLeafSet.add(os.path.split(dd)[1]) + assert os.path.isdir(nd) + assert os.path.isdir(dd) + assert os.path.islink(os.path.join(dd,k)) + e = os.path.abspath(nd) + g = os.path.abspath(os.path.join(dd,os.readlink(os.path.join(dd,k)))) + assert e 
== g,'Expected %s, got %s'%(e,g) + assert expectedLeafs == dateLeafSet, 'Expected %s, got %s'%(expectedLeafs,dateLeafSet) + + # test the for JsonDumpStorage default + d = dumpStorage.DumpStorage(self.testDir,subSlotCount=1) + dateLeafSet = set() + expectedLeafs = set(['55_0', '00_0', '20_0']) + for k,v in self.testData.items(): + nd,dd = d.newEntry(k,v[1]) + dateLeafSet.add(os.path.split(dd)[1]) + assert os.path.isdir(nd) + assert os.path.isdir(dd) + assert os.path.islink(os.path.join(dd,k)) + e = os.path.abspath(nd) + g = os.path.abspath(os.path.join(dd,os.readlink(os.path.join(dd,k)))) + assert e == g,'Expected %s, got %s'%(e,g) + assert expectedLeafs == dateLeafSet, 'Expected %s, got %s'%(expectedLeafs,dateLeafSet) + + # test the trailing _n case at same level + d = dumpStorage.DumpStorage(self.testDir,subSlotCount=3) + dateLeafSet = set() + expectedLeafs = set(['00_0', '20_0', '55_0']) + for k,v in self.testData.items(): + nd,dd = d.newEntry(k,v[1]) + dateLeafSet.add(os.path.split(dd)[1]) + assert os.path.isdir(nd) + assert os.path.isdir(dd) + assert os.path.islink(os.path.join(dd,k)) + e = os.path.abspath(nd) + g = os.path.abspath(os.path.join(dd,os.readlink(os.path.join(dd,k)))) + assert e == g,'Expected %s, got %s'%(e,g) + assert expectedLeafs == dateLeafSet, 'Expected %s, got %s'%(expectedLeafs,dateLeafSet) + + # test with subdirectory further down + d = dumpStorage.DumpStorage(self.testDir,subSlotCount=3) + dateLeafSet = set() + expectedLeafs = set(['wh_0', 'wh_1', 'wh_2']) + for k,v in self.testData.items(): + nd,dd = d.newEntry(k,v[1],webheadName='wh') + dateLeafSet.add(os.path.split(dd)[1]) + assert os.path.isdir(nd) + assert os.path.isdir(dd) + assert os.path.islink(os.path.join(dd,k)) + e = os.path.abspath(nd) + g = os.path.abspath(os.path.join(dd,os.readlink(os.path.join(dd,k)))) + assert e == g,'Expected %s, got %s'%(e,g) + assert expectedLeafs == dateLeafSet, 'Expected %s, got %s'%(expectedLeafs,dateLeafSet) + + def testChownGidVisitor(self): + pass # this is too simple to bother testing + + def testRelativeNameParts(self): + ooid = '12345678-dead-beef-feeb-daed2%d081225' + expected = {1:['12'],2:['12','34'],3:['12','34','56'],0:['12','34','56','78']} + d = dumpStorage.DumpStorage(self.testDir) + for depth in range(4): + tooid = ooid%(depth) + assert expected[depth] == d.relativeNameParts(tooid) + + def testDailyPart(self): + d = dumpStorage.DumpStorage(self.testDir) + testData = [ + ('12345678-dead-beef-feeb-daed20081225',datetime.datetime(2008,12,25,1,2,3, tzinfo=UTC),'20081225'), + ('12345678-dead-beef-feeb-daed20081225',datetime.datetime(2008,12,26,1,2,3, tzinfo=UTC),'20081226'), + ('12345678-dead-beef-feeb-daed20081225',None,'20081225'), + ('',datetime.datetime(2008,12,25,1,2,3, tzinfo=UTC),'20081225'), + (None,None,None), + ('',None,None), + ] + for ooid,date,expected in testData: + if expected: + got = d.dailyPart(ooid,date) + assert expected == got, 'Expected "%s" but got "%s"'%(expected,got) + else: + now = utc_now() + expected = "%4d%02d%02d"%(now.year,now.month,now.day) + assert expected == d.dailyPart(ooid,date), 'From (%s,%s) Expected "%s" but got "%s"'%(ooid,date,expected,got) + def testPathToDate(self): + d = dumpStorage.DumpStorage(self.testDir) + testCases = [ + (['blob','fook','nigl',d.root,'20081211',d.dateName,'10','09_0'],[2008,12,11,10,9]), + (['blob','fook','nigl',d.root,'20081211',d.dateName,'10','09','wh_0'],[2008,12,11,10,9]), + ([d.root,'20081211',d.dateName,'10','09','wh_3'],[2008,12,11,10,9]), + 
([d.root,'200z1211',d.dateName,'10','09','wh_3'],None), + ([d.root,'20081g11',d.dateName,'10','09','wh_3'],None), + ([d.root,'2008121-',d.dateName,'10','09','wh_3'],None), + ([d.root,'20081211',d.dateName,'26','09','wh_3'],None), + ([d.root,'20081211',d.dateName,'10','65','wh_3'],None), + ([d.root,'20081311',d.dateName,'10','09','wh_3'],None), + ([d.root,'20081232',d.dateName,'10','09','wh_3'],None), + ] + for (pathInfo,dateParts) in testCases: + path = os.sep.join(pathInfo) + if dateParts: + expected = datetime.datetime(*dateParts, tzinfo=UTC) + got = d.pathToDate(path) + assert expected == got, 'Expected: %s but got %s'%(expected,got) + else: + assert_raises(ValueError,d.pathToDate,path) + + def testLookupNamePath(self): + d = dumpStorage.DumpStorage(self.testDir) + count = 0 + expected ={} + for ooid,v in createJDS.jsonFileData.items(): + dateS = v[0] + if 0 == count%2: + nd,dd = d.newEntry(ooid,datetime.datetime(*[int(x) for x in dateS.split('-')], tzinfo=UTC)) + expected[ooid] = nd + elif 0 == count%5: + expected[ooid] = None + pass + else: + nd,dd = d.newEntry(ooid) + expected[ooid] = nd + count += 1 + for ooid,v in createJDS.jsonFileData.items(): + dateS = v[0] + testDate = datetime.datetime(*[int(x) for x in dateS.split('-')], tzinfo=UTC) + got,ignore = d.lookupNamePath(ooid,testDate) + assert expected[ooid] == got, 'For %s, expected path %s, got %s'%(ooid,expected,got) + + def testNamePath(self): + d = dumpStorage.DumpStorage(self.testDir) + for k,v in self.testData.items(): + g = d.namePath(k,v[1])[0] + e = os.sep.join((d.root,d.dailyPart(k,v[1]),d.indexName,v[0])) + assert e == g, 'Expected "%s", got "%s"'%(e,g) + + def testDatePath(self): + d = dumpStorage.DumpStorage(self.testDir) + for k,v in self.testData.items(): + g = d.datePath(v[1])[0] + e = os.sep.join((d.root,d.dailyPart(k,v[1]),d.dateName,v[2])) + assert e == g, 'Expected "%s", got "%s"'%(e,g) + d = dumpStorage.DumpStorage(self.testDir,subSlotCount=3) + curcount = 0 + for k,v in self.testData.items(): + g = d.datePath(v[1])[0] + e = os.sep.join((d.root,d.dailyPart(k,v[1]),d.dateName,"%s_%d"%(v[2],curcount))) + #curcount = (curcount + 1) % d.subSlotCount + assert e == g, 'Expected "%s", got "%s"'%(e,g) + curcount = 0 + for k,v in self.testData.items(): + g = d.datePath(v[1],webheadName='boot')[0] + e = os.sep.join((d.root,d.dailyPart(k,v[1]),d.dateName,v[2],"%s_%d"%('boot',curcount))) + #curcount = (curcount + 1) % d.subSlotCount + assert e == g, 'Expected "%s", got "%s"'%(e,g) + + def testMakeDateDir(self): + d = dumpStorage.DumpStorage(self.testDir) + d3 = dumpStorage.DumpStorage(self.testDir,subSlotCount=3) + # first test: Make a file of the same name as a subdir and see it fail as expected + testItem = self.testData.items()[0][1] + date = testItem[1] + datePathPart = testItem[2] + while True: + head,tail = os.path.split(datePathPart) + if head == tail: break + dirPart = os.sep.join((d.root,d.dailyPart('',date),d.dateName,head)) + try: + shutil.rmtree(d.root) + except: + pass + filePart = os.path.join(dirPart,tail) + os.makedirs(dirPart) + f = open(filePart,'w') + f.write("nothing\n") + f.close() + assert_raises(OSError,d.makeDateDir,date) + assert_raises(OSError,d3.makeDateDir,date,'boot') + datePathPart = head + try: + shutil.rmtree(d.root) + except: + pass + for k,v in self.testData.items(): + g,dum = d.makeDateDir(v[1]) + e = os.sep.join((d.root, d.dailyPart(k,v[1]),d.dateName,v[2])) + + g0,dum0 = d3.makeDateDir(v[1]) + e0 = os.sep.join((d.root, d.dailyPart(k,v[1]),d.dateName,"%s_%d"%(v[2],0))) + + g3,dum3 = 
d3.makeDateDir(v[1],'boot') + e3 = os.sep.join((d.root, d.dailyPart(k,v[1]),d.dateName,v[2],"%s_%d"%('boot',0))) + + assert e == g, 'Expected "%s", got "%s"'%(e,g) + assert os.path.isdir(g), 'But "%s" is not a dir'%g + assert e0 == g0, 'Expected "%s", got "%s"'%(e0,g0) + assert os.path.isdir(g0), 'But "%s" is not a dir'%g + assert e3 == g3, 'Expected "%s", got "%s"'%(e3,g3) + assert os.path.isdir(g3), 'But "%s" is not a dir'%g + + def testMakeNameDir(self): + d = dumpStorage.DumpStorage(self.testDir) + # first test: Make a file of the same name and see it fail as expected + testItem = self.testData.items()[0] + testOoid = testItem[0] + testPath = testItem[1][0] + testDate = testItem[1][1] + while True: + head,tail = os.path.split(testPath) + if head == tail: break + dirPart = os.sep.join((d.root,d.dailyPart(testOoid,testDate),d.indexName,head)) + try: + shutil.rmtree(d.root) + except: + pass + filePart = os.path.join(dirPart,tail) + os.makedirs(dirPart) + f = open(filePart,'w') + f.write("nothing\n") + f.close() + assert_raises(OSError,d.makeNameDir,testOoid) + testPath = head + try: + shutil.rmtree(d.root) + except: + pass + + for k,v in self.testData.items(): + g,dum = d.makeNameDir(k) + e = os.path.join(d.root,d.dailyPart(k,v[1]),d.indexName,v[0]) + assert e == g, 'Expected "%s" got "%s"'%(e,g) + + def testLookupOoidInDatePath(self): + d = dumpStorage.DumpStorage(self.testDir) + expected = {} + count = 0 + for ooid,v in createJDS.jsonFileData.items(): + dateS = v[0] + if 0 == count%2: + nd,dd = d.newEntry(ooid,datetime.datetime(*[int(x) for x in dateS.split('-')], tzinfo=UTC)) + expected[ooid] = dd + elif 0 == count%5: + expected[ooid] = None + pass + else: + nd,dd = d.newEntry(ooid) + expected[ooid] = dd + count += 1 + dateS = v[0] + count = 0 + for ooid in createJDS.jsonFileData.keys(): + dateS = v[0] + if expected[ooid]: + exEnd = datetime.datetime(*[int(x) for x in dateS.split('-')], tzinfo=UTC) + passDate = utc_now() + if 0 == count%3: + passDate = None + else: + passDate = exEnd + got,ignore = d.lookupOoidInDatePath(passDate,ooid) + assert expected[ooid] == got, 'For %s: Expected %s, got %s'%(ooid,expected[ooid],got) + + def testReadableOrThrow(self): + d = dumpStorage.DumpStorage + assert_raises(OSError,d.readableOrThrow,self.testDir) + os.mkdir(self.testDir) + tname = '/tmp/someUselessFile_' + d.readableOrThrow(self.testDir) + f = open(tname,'w') + f.write('something') + f.close() + os.chmod(tname,0) + try: + assert_raises(OSError,d.readableOrThrow,tname) + finally: + os.chmod(tname,0200) + os.unlink(tname) diff --git a/socorro/unittest/external/filesystem/test_filesystem.py b/socorro/unittest/external/filesystem/test_filesystem.py new file mode 100644 index 0000000000..8036b54b39 --- /dev/null +++ b/socorro/unittest/external/filesystem/test_filesystem.py @@ -0,0 +1,284 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
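#----------------------------------------------------------------------------
# Illustrative sketch, not part of the patch above: how the DumpStorage path
# helpers exercised in test_dump_storage.py fit together for one entry from
# self.testData. Paths are composed by hand; only the layout is shown.
import os

root = 'TEST-DUMP'
ooid = '0bba61c5-dfc3-43e7-dead-8afd22081225'
# dailyPart() uses the supplied timestamp (2008-12-25 here) and only falls
# back to the date encoded at the end of the ooid when no timestamp is given.
daily = '20081225'
name_path = os.sep.join((root, daily, 'name', '0b', 'ba'))   # radix depth 2 for this ooid
date_path = os.sep.join((root, daily, 'date', '12', '00'))   # hour 12, 5-minute slot 00
# newEntry() creates both directories and cross-links them with symlinks named
# after the ooid, as testNewEntry asserts.
#----------------------------------------------------------------------------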
+ +import os +import shutil +import types + +from nose.tools import * +import socorro.external.filesystem.filesystem as f +from socorro.unittest.testbase import TestCase + +# Describes the directory/file structure we will look at: +# key is a name +# if value is a dictionary, the named item is a directory +# if value is None, the named item is an empty file +# otherwise, the named item is a file holding str(value) +testDir = {'TestDir': + {'f0': 'file TestDir/f0', + 'g0': 'file TestDir/g0', + '0': {'f0a': 'file TestDir/0/f0a', 'f0b': 'file TestDir/0/f0b' }, + '1': {'f1a': None,'f1b': None, + '10': {}, + '11': {}, + }, + '2': {'f2a': None,'f2b':None, + '20': + {'200': + {'2000': + {'d0': 'file TestDir/2/20/200/2000/d0', + 'd1': 'file TestDir/2/20/200/2000/d1', + }, + }, + }, + }, + '4': {'f4': None, + '40': + {'f40':None, + '400': + {'f400':None, + '4000': + {'f4000':None + }, + }, + }, + }, + }, + } + +def acceptDirOnly(t): + return os.path.isdir(t[2]) +def acceptFileOnly(t): + return os.path.isfile(t[2]) +def accept2Dirs(t): + return t[1].startswith('2') +def revcmp(d0,d1): + return cmp(d1,d0) + +class TestFilesystem(TestCase): + def createTestbed(self): + self.deleteTestbed() # just in case + self.createTestDir('.',testDir) + + def createTestDir(self,root,dict): + for k in dict.keys(): + v = dict[k] + if type(v) == types.DictionaryType: + newroot = os.path.join(root,k) + os.mkdir(newroot) + self.createTestDir(newroot,dict.get(k)) + elif type(v) == types.NoneType: + open(os.path.join(root,k),'w').close() + else: + f = open(os.path.join(root,k),'w') + f.write("%s\n" %(v)) + f.close() + + def deleteTestbed(self): + for topLevelDir in testDir.keys(): + if(os.path.exists(os.path.join('.',topLevelDir))): + shutil.rmtree(os.path.join('.',topLevelDir)) + + def setUp(self): + self.createTestbed() + assert 1 == len(testDir.keys()), 'Only one top-level test directory' + self.tdir = testDir.keys()[0] + + def tearDown(self): + self.deleteTestbed() + + def testLevel0(self): + for depth in [ -12,-1,0]: + tst = f.findFileGenerator(self.tdir,maxDepth = depth) + items = [x for x in tst] + assert not items, 'Expect nothing for 0 or negative. For %d, got %s' %(depth,items) + + + def testLevel1(self): + # Look for all top level items regardless of type. 
+ for depth in [1] : + tst = f.findFileGenerator(self.tdir,maxDepth = depth) + items = [] + expected = [ x for x in testDir[self.tdir].keys() ] + for (x,o,p) in tst: + items.append(o) + assert o in expected ,'Item %s must be expected: %s' %(o,expected) + for k in expected: + assert k in items, 'Expected item %s must be found in %s' %(k,items) + + # look for only top level files + items = [] + expected = ['f0','g0'] + t = f.findFileGenerator(self.tdir,acceptanceFunction = acceptFileOnly, maxDepth = depth) + for (x,o,p) in t: + items.append(o) + assert o in expected, 'depth=%d,expect a top level file, got '+o+' not in '+str(expected) % depth + for x in expected: + assert x in items, 'depth=%d,expect both top level files' % depth + + # look for only top level directories + items = [] + expected = ['0','1','2','4'] + t = f.findFileGenerator(testDir.keys()[0],acceptanceFunction = acceptDirOnly, maxDepth = depth) + for (x,o,p) in t: + items.append(o) + assert o in expected, 'depth=%d,expect a top level directory' % depth + for x in expected: + assert x in items, 'depth=%d,expect all top level directories' % depth + + def testLevels(self): + tst = f.findFileGenerator(self.tdir,maxDepth = 2) + items = [] + expected = ['f0a', 'f0b', '0', '10', '11', 'f1a', 'f1b', '1', '20', 'f2a', 'f2b', '2', '40', 'f4', '4', 'f0', 'g0'] + for (x,o,p) in tst: + items.append(o) + assert o in expected + for o in expected: + assert o in items + tst = f.findFileGenerator(self.tdir,maxDepth = 3) + items = [] + expected = ['f0a', 'f0b', '0', '10', '11', 'f1a', 'f1b', '1', '200', '20', 'f2a', 'f2b', '2', '400', 'f40', '40', 'f4', '4', 'f0', 'g0'] + for (x,o,p) in tst: + items.append(o) + assert o in expected + for o in expected: + assert o in items + tst = f.findFileGenerator(self.tdir,maxDepth = 4) + items = [] + expected = ['f0a', 'f0b', '0', '10', '11', 'f1a', 'f1b', '1', '2000', '200', '20', 'f2a', 'f2b', '2', '4000', 'f400', '400', 'f40', '40', 'f4', '4', 'f0', 'g0'] + for (x,o,p) in tst: + items.append(o) + assert o in expected + for o in expected: + assert o in items + tst = f.findFileGenerator(self.tdir,maxDepth = 100) + items = [] + expected = ['f0a', 'f0b', '0', '10', '11', 'f1a', 'f1b', '1', 'd0', 'd1', '2000', '200', '20', 'f2a', 'f2b', '2', 'f4000', '4000', 'f400', '400', 'f40', '40', 'f4', '4', 'f0', 'g0'] + for (x,o,p) in tst: + items.append(o) + assert o in expected + for o in expected: + assert o in items + + def testCompare(self): + #This test won't work for depth > 1 since the directories are visited individually + tst = f.findFileGenerator(self.tdir,maxDepth = 1) + items = [] + for (x,o,p) in tst: + items.append(o) + tst = f.findFileGenerator(self.tdir,maxDepth = 1,directorySortFunction=revcmp) + ritems = [] + for (x,o,p) in tst: + ritems.append(o) + ritems.reverse() + assert(items == ritems) + + def testDirAcceptance(self): + tst = f.findFileGenerator(self.tdir,maxDepth = 100,directoryAcceptanceFunction=accept2Dirs) + items = [] + expected = ['0', '1', 'd0', 'd1', '2000', '200', '20', 'f2a', 'f2b', '2', '4', 'f0', 'g0'] + for (x,o,p) in tst: + items.append(o) + assert o in expected + for o in expected: + assert o in items + + def testFailMakedirsOnFileInPath(self): + path = 'TestDir/1/2/3/4' + tpath = path + while True: + head,tail = os.path.split(tpath) + if tail == 'TestDir': break + try: + shutil.rmtree('TestDir') + except: + pass + f.makedirs(head) + t = open(tpath,'w') + t.write('nothing\n') + t.close() + try: + f.makedirs(path) + assert False, 'We should have had an OSError, but success for %s a 
file'%tpath + except OSError: + pass + except Exception,x: + assert False, 'We should have had an OSError, got %s: %s'%(type(x),x) + tpath = head + + def testCleanEmptySubdirectories(self): + f.makedirs('TestDir/A/B/C/D') + f.makedirs('TestDir/AA/BB/C') + f.makedirs('TestDir/AA/BB/CC/DD') + fi = open('TestDir/A/a','w') + fi.write('file a\n') + fi.close() + # Test short-circuit path, full stopper + assert os.path.isdir('TestDir/A/B/C/D') + f.cleanEmptySubdirectories('TestDir/A/B/C/D','TestDir/A/B/C/D') + assert os.path.isdir('TestDir/A/B/C/D') + # Test short-circuit path, name stopper + f.cleanEmptySubdirectories('D','TestDir/A/B/C/D') + assert os.path.isdir('TestDir/A/B/C/D') + + # Test some empties, name stopper + f.cleanEmptySubdirectories('C','TestDir/A/B/C/D') + assert not os.path.exists('TestDir/A/B/C/D') + assert os.path.isdir('TestDir/A/B/C') + # Test some empties, path stopper + f.cleanEmptySubdirectories('TestDir/A/B','TestDir/A/B/C') + assert not os.path.exists('TestDir/A/B/C') + assert os.path.isdir('TestDir/A/B') + + #Test stopping on a file in a subdir + f.cleanEmptySubdirectories('TestDir','TestDir/A/B') + assert not os.path.exists('TestDir/A/B') + assert os.path.isdir('TestDir/A') + + #Test stopping on another subdir + f.cleanEmptySubdirectories('TestDir/AA','TestDir/AA/BB/CC/DD') + assert not os.path.exists('TestDir/AA/BB/CC') + assert os.path.isdir('TestDir/AA/BB') + + #Test for stopper not in path + assert_raises(OSError,f.cleanEmptySubdirectories,'Woo','TestDir/AA/BB') + + #Test for non-existent leaf + assert_raises(OSError,f.cleanEmptySubdirectories,'TestDir','TestDir/AA/BB/CC/DD') + + def testVisitPath(self): + f.makedirs('TestDir/a/b/c/d/e/f') + fi = open('TestDir/a/b/c/d/D0','w') + fi.write("hi\n") + fi.close + seen = set() + def collector(x): + seen.add(x) + top = 'TestDir/a' + last = 'TestDir/a/b/c/d' + absTop = os.path.normpath(top) + expected = set([absTop]) + for i in [['b'],['b','c'],['b','c','d']]: + expected.add(os.path.join(absTop,os.sep.join(i))) + f.visitPath(top,last,collector) + assert expected == seen, 'but x-s=%s and s-x=%s'%(expected-seen,seen-expected) + + seen.clear() + top = 'TestDir/a/b' + last = 'TestDir/a/b/c/d/D0' + normTop = os.path.normpath(top) + expected = set([normTop]) + for i in [['c'],['c','d']]: + expected.add(os.path.join(normTop,os.sep.join(i))) + f.visitPath(top,last,collector) + assert expected == seen, 'but x-s=%s and s-x=%s'%(expected-seen,seen-expected) + + #Test for non-existent leaf + assert_raises(OSError,f.visitPath,'TestDir','TestDir/A/BB',collector) + + #Test for rootDir not abover fullPath + assert_raises(OSError,f.visitPath,'TestDir/A/B','TestDir/A',collector) + +if __name__ == "__main__": + unittest.main() diff --git a/socorro/unittest/external/filesystem/test_json_dump_storage.py b/socorro/unittest/external/filesystem/test_json_dump_storage.py new file mode 100644 index 0000000000..92dcd184cd --- /dev/null +++ b/socorro/unittest/external/filesystem/test_json_dump_storage.py @@ -0,0 +1,467 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
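#----------------------------------------------------------------------------
# Illustrative sketch, not part of the patch above: the tuple convention the
# acceptance functions in test_filesystem.py rely on. findFileGenerator yields
# 3-tuples whose second element is the entry name and whose third is its full
# path (the tests above never use the first element). The helper name below is
# hypothetical.
import os

def accept_json_only(entry):
    _, name, full_path = entry      # same unpacking as acceptDirOnly/accept2Dirs
    return os.path.isfile(full_path) and name.endswith('.json')

# usage, with the import and keyword arguments already used in the tests above:
#   import socorro.external.filesystem.filesystem as f
#   json_files = [p for (x, o, p) in
#                 f.findFileGenerator('TestDir',
#                                     acceptanceFunction=accept_json_only,
#                                     maxDepth=2)]
#----------------------------------------------------------------------------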
+ +import os +import shutil +import datetime +import json + +from nose.tools import eq_, ok_, assert_raises + +import socorro.external.filesystem.json_dump_storage as JDS +import socorro.lib.util +from socorro.lib.datetimeutil import UTC + +import socorro.unittest.testlib.createJsonDumpStore as createJDS +from socorro.unittest.testbase import TestCase + + +class TestJsonDumpStorage(TestCase): + def setUp(self): + self.testDir = os.path.join('.','TEST-JSONDUMP')+'/' + self.testMoveTo = os.path.join('.','TEST-MOVETO') + self.testMoveFrom = os.path.join('.','TEST-MOVEFROM') + self.testMoveToAlt = os.path.join('.','TEST-MOVETO-ALT') + fakeLogger = socorro.lib.util.SilentFakeLogger() + self.initKwargs = { + 0:{'logger': fakeLogger}, + 1:{'logger': fakeLogger,'dateName':'by_date','indexName':'by_name','jsonSuffix':'JS','dumpSuffix':'.DS',}, + 2:{'logger': fakeLogger,'jsonSuffix':'JS','dumpSuffix':'.DS',}, + 3:{'logger': fakeLogger,'dateName':'by_date','indexName':'index',}, + } + + self.currenttimes = { + '0bba61c5-dfc3-43e7-dead-8afd20081225': ['+0','webhead02', '0b/ba/61/c5','webhead02_0'], + '0bba929f-8721-460c-dead-a43c20081225': ['+0','webhead02', '0b/ba/92/9f','webhead02_0'], + '0b9ff107-8672-4aac-dead-b2bd20081225': ['+0','webhead01', '0b/9f/f1/07','webhead01_0'], + '22adfb61-f75b-11dc-dead-001320081225': ['+0','webhead01', '22/ad/fb/61','webhead01_0'], + 'b965de73-ae90-a935-dead-03ae20081225': ['+0','webhead01', 'b9/65/de/73','webhead01_0'], + '0b781b88-ecbe-4cc4-dead-6bbb20081225': ['+5','webhead01', '0b/78/1b/88','webhead01_0'], + '0b8344d6-9021-4db9-dead-a15320081225': ['+5','webhead01', '0b/83/44/d6','webhead01_0'], + '0b94199b-b90b-4683-dead-411420081225': ['+5','webhead01', '0b/94/19/9b','webhead01_0'], + '0b9eedc3-9a79-4ce2-dead-155920081225': ['+5','webhead01', '0b/9e/ed/c3','webhead01_0'], + '0b9fd6da-27e4-46aa-dead-3deb20081225': ['+10','webhead02','0b/9f/d6/da','webhead02_0'], + '0ba32a30-2476-4724-dead-de1720081225': ['+10','webhead02','0b/a3/2a/30','webhead02_0'], + '0bad640f-5825-4d42-dead-21b820081225': ['+10','webhead02','0b/ad/64/0f','webhead02_0'], + '0bae7049-bbff-49f2-dead-7e9f20081225': ['+10','webhead02','0b/ae/70/49','webhead02_0'], + '0baf1b4d-dad3-4d35-dead-b9dc20081225': ['+10','webhead02','0b/af/1b/4d','webhead02_0'], + } + + try: + shutil.rmtree(self.testDir) + except OSError: + pass # ok if there is no such test directory + os.mkdir(self.testDir) + + storage = JDS.JsonDumpStorage(self.testDir,**self.initKwargs[2]) + self.dump_field = storage.dump_field + self.dumpSuffix = storage.dumpSuffix + + def tearDown(self): + try: + shutil.rmtree(self.testDir) + except OSError: + pass # ok if there is no such test directory + try: + shutil.rmtree(self.testMoveTo) + except OSError: + pass + try: + shutil.rmtree(self.testMoveFrom) + except OSError: + pass + try: + shutil.rmtree(self.testMoveToAlt) + except OSError: + pass + + def __getSlot(self,minsperslot,minute): + return minsperslot * int(minute/minsperslot) + + def __hasLinkOrFail(self,jsonStorage,uuid): + linkPath = jsonStorage.getJson(uuid)[:-len(jsonStorage.jsonSuffix)] + try: + os.readlink(linkPath) + except Exception,x: + assert False, '(%s:%s) Expected to be able to readlink from %s'%(type(x),x,linkPath) + def __hasNoLinkOrFail(self,jsonStorage,uuid): + linkPath = jsonStorage.getJson(uuid)[:-len(jsonStorage.jsonSuffix)] + try: + os.readlink(linkPath) + assert False, 'Expected to find no link: %s '%linkPath + except OSError,x: + assert 2 == x.errno, "Expected errno=2, got %d for linkpath %s"%(x.errno,linkPath) + 
except Exception,x: + assert False, "Expected OSError, got %s for linkpath %s"%(x,linkPath) + + def __hasDatePathOrFail(self,jsonStorage,ooid,dt): + dpath,dpathParts = jsonStorage.lookupOoidInDatePath(dt,ooid) + assert os.path.isdir(dpath), 'Expect %s is a directory'%(dpath) + + def __relativeDateParts(self,dateString,minutesPerSlot): + """ given "YYYY-mm-dd-hh-mm", return [hh,slot]""" + hh,mm = dateString.split('-')[-2:] + slot = int(mm) - int(mm)%minutesPerSlot + return [hh,"%02d"%slot] + + def testConstructor(self): + self.constructorAlt(self.testDir,**self.initKwargs[0]) + self.constructorAlt(self.testDir,**self.initKwargs[1]) + self.constructorAlt(self.testDir,**self.initKwargs[2]) + self.constructorAlt(self.testDir,**self.initKwargs[3]) + + def constructorAlt(self,*args,**kwargs): + storage = JDS.JsonDumpStorage(self.testDir,**kwargs) + assert storage.dateName == kwargs.get('dateName','date'),'From kwargs=%s'%kwargs + assert storage.indexName == kwargs.get('indexName','name'),'From kwargs=%s'%kwargs + assert storage.jsonSuffix == '.'+kwargs.get('jsonSuffix','json'),'We will always pass non-dot json suffix. From kwargs=%s'%kwargs + assert storage.dumpSuffix == kwargs.get('dumpSuffix','.dump'),'We will always pass dot dump suffix. From kwargs=%s'%kwargs + assert self.testDir.rstrip(os.sep) == storage.root,'From kwargs=%s'%kwargs + + def testNewEntry(self): + storage = JDS.JsonDumpStorage(self.testDir,**self.initKwargs[2]) + for uuid,data in createJDS.jsonFileData.items(): + datetimedata = [int(x) for x in data[0].split('-')] + uuid = ''.join((uuid[:-7],'2',uuid[-6:])) + stamp = datetime.datetime(*datetimedata, tzinfo=UTC) + try: + fj,fd = storage.newEntry(uuid,webheadHostName=data[1],timestamp = stamp) + except IOError: + assert False, 'Expect to succeed with newEntry(%s,...)' % uuid + + assert fj, 'Expect a non-null json file handle from newEntry(%s,...)' % uuid + loc2 = data[2][0:5] # We are not honoring ooid depth + expectFileBase = os.sep.join((storage.root, storage.dailyPart('',stamp), storage.indexName,loc2)) + expectJson = os.path.join(expectFileBase,uuid+storage.jsonSuffix) + assert expectJson == fj.name, 'For %s, expect %s, got %s' % (uuid,expectJson,fj.name) + assert fd, 'Expect a non-null dump file handle from newEntry(%s,...)' % uuid + expectDump = os.path.join(expectFileBase,uuid+storage.dumpSuffix) + assert expectDump == fd.name, 'For %s, expect %s, got %s' % (uuid,expectDump,fj.name) + loc3parts = self.__relativeDateParts(data[0],storage.minutesPerSlot) + loc3parts.append(data[3][-len('webhead0x_x'):]) + loc3 = os.sep.join(loc3parts) + lbase = os.sep.join((storage.root, storage.dailyPart('',stamp), storage.dateName, loc3)) + lpath = os.path.join(lbase,uuid) + assert os.path.islink(lpath), 'Expect a link from timed to storage for %s' % uuid + relNamePath = os.path.join(lbase,os.readlink(lpath)) + assert os.path.isdir(relNamePath), 'Expected %s to be a Name directory'%(relNamePath) + lpath = os.path.join(expectFileBase,uuid) + assert os.path.islink(lpath), 'Expect link from name storage to timed for %s' % uuid + relDatePath = os.path.join(expectFileBase,os.readlink(lpath)) + assert os.path.isdir(relDatePath), 'Expected %s to be a Date directory'%(relDatePath) + try: + try: + fj.write("testing\n") + assert True, 'must be able to write to the json file for uuid %s' % uuid + except: + assert False, 'must not fail to write to the json file for uuid %s' % uuid + finally: + if fj: fj.close() + + try: + try: + fd.write("testing\n") + assert True, 'must be able to write to the 
dump file for uuid %s' % uuid + except: + assert False, 'must not fail to write to the dump file for uuid %s' % uuid + finally: + if fd: fd.close() + + def testGetJson(self): + createJDS.createTestSet(createJDS.jsonFileData, self.initKwargs[0],self.testDir) + storage = JDS.JsonDumpStorage(self.testDir,**self.initKwargs[0]) + for uuid,data in createJDS.jsonFileData.items(): + dateparts = data[0].split('-') + daily = "%4d%02d%02d"%tuple([int(x) for x in dateparts[:3]]) + expected = os.sep.join((storage.root,daily,storage.indexName,data[2],uuid+storage.jsonSuffix)) + got = storage.getJson(uuid) + assert expected == got, 'Expected json file %s, got %s' % (expected,got) + try: + storage.getJson(createJDS.jsonBadUuid) + assert False, 'Expect to throw IOError from attempt to getJson(non-existent-uuid)' + except OSError,e: + assert True, 'Got expected error from attempt to getJson(non-existent-uuid)' + except Exception, e: + assert False, 'Got unexpected error %s from attempt to getJson(non-existent-uuid' % e + + def testGetDump(self): + createJDS.createTestSet(createJDS.jsonFileData,self.initKwargs[1],self.testDir) + storage = JDS.JsonDumpStorage(self.testDir,**self.initKwargs[1]) + for uuid,data in createJDS.jsonFileData.items(): + dateparts = data[0].split('-') + daily = "%4d%02d%02d"%tuple([int(x) for x in dateparts[:3]]) + expected = os.sep.join((storage.root,daily,storage.indexName,data[2],uuid+storage.dumpSuffix)) + got = storage.getDump(uuid) + assert expected == got, 'Expected dump file %s, got %s' % (expected,got) + try: + storage.getDump(createJDS.jsonBadUuid) + assert False, 'Should throw IOError from attempt to getDumpAsFile(non-existent-uuid)' + except OSError,e: + assert True + except Exception, e: + assert False, 'Got unexpected error(type) %s from attempt to getDumpAsFile(non-existent-uuid' % e + + def markAsSeen(self): + createJDS.createTestSet(createJDS.jsonFileData,self.initKwargs[3],self.testDir) + storage = JDS.JsonDumpStorage(self.testDir,**self.initKwargs[3]) + for uuid,data in createJDS.jsonFileData.items(): + assert os.path.islink(os.sep.join((storage.dateBranch,data[3],uuid))), 'Expect a link from date to name for %s' % uuid + assert os.path.islink(os.sep.join((storage.nameBranch,data[2],uuid))), 'Expect link from name to timed for %s' % uuid + assert not os.path.islink(os.sep.join((storage.dateBranch,data[3],uuid))), 'Expect no link from date to name for %s' % uuid + assert not os.path.islink(os.sep.join((storage.nameBranch,data[2],uuid))), 'Expect no link from name to date for %s' % uuid + try: + storage.markAsSeen(createJDS.jsonBadUuid) + assert False, 'Expect to throw IOError from attempt to openAndMarkAsSeen(non-existent-uuid)' + except IOError: + assert True, 'Got expected error from attempt to openAndMarkAsSeen(non-existent-uuid)' + except Exception, e: + assert False, 'Got unexpected error %s from attempt to openAndMarkAsSeen(non-existent-uuid' % e + assert not os.listdir(storage.dateBranch), 'Expect empty, got %s' % os.listdir(storage.dateBranch) + + def testDestructiveDateWalk(self): + createJDS.createTestSet(createJDS.jsonFileData,self.initKwargs[0],self.testDir) + storage = JDS.JsonDumpStorage(self.testDir,**self.initKwargs[0]) + uuids = createJDS.jsonFileData.keys() + #uuidsSet = set(uuids) + seenids = set() + for id in storage.destructiveDateWalk(): + assert id in uuids, 'Expect that %s is among the uuids we stored\n%s' % (id,uuids) + seenids.add(id) + for id in uuids: + assert id in seenids, 'Expect that we found every uuid we stored (%s) from %s' % 
(id,seenids) + daily = os.listdir(storage.root) + for d in daily: + assert not storage.dateName in os.listdir(os.path.join(storage.root,d)), 'Expected all date subdirs to be gone, but %s'%d + + def testMarkAsSeen(self): + """testNewJsonDumpStorage:TestJsonDumpStorage.testMarkAsSeen() + somewhat bogus test: Doesn't look for failure modes + """ + createJDS.createTestSet(createJDS.jsonFileData,self.initKwargs[0],rootDir=self.testDir) + storage = JDS.JsonDumpStorage(self.testDir,**self.initKwargs[2]) + for ooid in createJDS.jsonFileData.keys(): + namePath,parts = storage.namePath(ooid) + linkInName = os.path.join(namePath,ooid) + assert os.path.islink(linkInName), 'expected %s as link'%linkInName + dpath = os.path.join(namePath,os.readlink(linkInName)) + linkInDate = os.path.join(dpath,ooid) + assert os.path.islink(linkInDate), 'expected %s as link'%linkInDate + storage.markAsSeen(ooid) + assert not os.path.exists(linkInName), 'expected %s gone'%linkInName + assert not os.path.exists(linkInDate), 'expected %s gone'%linkInDate + + def testDestructiveDateWalkNotNow(self): + createJDS.createTestSet(self.currenttimes,self.initKwargs[1],self.testDir) + storage = JDS.JsonDumpStorage(self.testDir,**self.initKwargs[1]) + seenids = [] + for id in storage.destructiveDateWalk(): + seenids.append(id) + assert [] == seenids + + def testRemove(self): + createJDS.createTestSet(createJDS.jsonFileData,self.initKwargs[2],self.testDir) + storage = JDS.JsonDumpStorage(self.testDir,**self.initKwargs[2]) + counter = 0 + for uuid in createJDS.jsonFileData.keys(): + if 0 == counter % 3: + # test that we don't throw for missing links + storage.markAsSeen(uuid) + if 1 == counter % 3: + # test that we don't throw for one missing file + if 0 == counter % 2: + os.unlink(storage.getDump(uuid)) + else: + os.unlink(storage.getJson(uuid)) + if 2 == counter % 3: + # test that we don't throw for both missing files, but with links + os.unlink(storage.getJson(uuid)) + os.unlink(storage.getDump(uuid)) + storage.remove(uuid) + counter += 1 + allfiles = [] + alllinks = [] + for dir, dirs, files in os.walk(self.testDir): + for file in files: + allfiles.append(file) + if os.path.islink(os.path.join(dir,file)): + alllinks.append(file) + for d in dirs: + if os.path.islink(os.path.join(dir,d)): + alllinks.append(d) + + assert [] == allfiles, 'Expect that all removed files are gone, but found %s' % allfiles + assert [] == alllinks, 'Expect that all links are gone, but found %s' % alllinks + assert_raises(JDS.NoSuchUuidFound, storage.remove, "bogusdatax3yymmdd") + + def testRemoveAlsoNames(self): + """testJsonDumpStorage:TestJsonDumpStorage.testRemoveAlsoNames(self) + Try to remove them all, and check that they are indeed all gone. 
+ """ + createJDS.createTestSet(createJDS.jsonFileData,self.initKwargs[2],self.testDir) + kwargs = self.initKwargs[2] + kwargs['cleanIndexDirectories'] = 'True' + storage = JDS.JsonDumpStorage(self.testDir,**kwargs) + for uuid,data in createJDS.jsonFileData.items(): + storage.remove(uuid) + assert not os.listdir(storage.root), 'Expected them all to go, but %s'%(os.listdir(storage.root)) + + def testRemoveRemovesOnlyDate(self): + createJDS.createTestSet(createJDS.jsonFileData,self.initKwargs[2],self.testDir) + storage = JDS.JsonDumpStorage(self.testDir,**self.initKwargs[2]) + dailies = set([]) + expectedSubs = [] + alldirs = [] + allfiles = [] + alllinks = [] + for uuid,data in createJDS.jsonFileData.items(): + dailies.add(''.join(data[0].split('-')[:3])) + storage.remove(uuid) + for day in dailies: + for dir, dirs, files in os.walk(os.sep.join((storage.root,day,storage.dateName))): + for file in files: + allfiles.append(file) + if os.path.islink(os.path.join(dir,file)): + alllinks.append(file) + for d in dirs: + if os.path.islink(os.path.join(dir,d)): + alllinks.append(d) + alldirs.append(os.path.join(dir,d)) + assert [] == allfiles, 'Expect that all removed files are gone, but found %s' % allfiles + assert [] == alllinks, 'Expcet that all links are gone, but found %s' % alllinks + assert [] == alldirs, 'Expect that all date dirs are gone, but found %s' % alldirs + + for day in dailies: + for dir,dirs,files in os.walk(os.sep.join((storage.root,day,storage.indexName))): + for file in files: + allfiles.append(file) + if os.path.islink(os.path.join(dir,file)): + alllinks.append(file) + for d in dirs: + if os.path.islink(os.path.join(dir,d)): + alllinks.append(d) + alldirs.append(os.path.join(dir,d)) + assert [] == allfiles, 'Expect that all removed files are gone, but found %s' % allfiles + assert [] == alllinks, 'Expect that all links are gone, but found %s' % alllinks + for sub in expectedSubs: + assert sub in alldirs, "Expect each subdirectory is still there, but didn't find %s" % sub + + def testRemoveWithBadlyFormattedDateLink(self): + createJDS.createTestSet(createJDS.jsonFileData,self.initKwargs[2],self.testDir) + storage = JDS.JsonDumpStorage(self.testDir,**self.initKwargs[2]) + uuid = createJDS.jsonFileData.keys()[0] + head, json_unused = os.path.split(storage.getJson(uuid)) + target = os.readlink(os.path.join(head,uuid)) + idx = target.index('/date/') + target = "%s%s%s" %(target[:idx+6],target[idx+7:idx+10],target[idx+10:]) + os.unlink(os.path.join(head,uuid)) + os.symlink(target,os.path.join(head,uuid)) + #print "LINK:%s"%(os.readlink(os.path.join(head,uuid))) + # assure that we don't throw for a badly formatted path + storage.remove(uuid) + + def _create_multidump_data(self): + return [ ('0bba61c5-dfc3-3333-dead-8afd20081225', + {'ProductName': 'X', + 'submitted_timestamp': '2012-12-15 11:23:45'}, + {'upload_file_minidump': 'fake main dump', + 'aux_dump_0': 'fake aux_dump_0', + 'aux_dump_1': 'fake aux_dump_1'}), + ('0bba61c5-dfc3-43e7-dead-8afd20081225', + {'ProductName': 'Y', + 'submitted_timestamp': '2012-12-15 20:13:33'}, + {'upload_file_minidump': '2nd fake main dump', + 'aux_dump_0': '2nd fake aux_dump_0', + 'aux_dump_1': '2nd fake aux_dump_1'}), + ] + + def _expected_dump_path(self, base_path, crash_id, dump_name): + if dump_name == self.dump_field or dump_name is None: + return '%s/%s%s' % (base_path, crash_id, self.dumpSuffix) + else: + return '%s/%s.%s%s' % (base_path, crash_id, dump_name, + self.dumpSuffix) + + + def test_store_and_retrieve_multidumps(self): + storage = 
JDS.JsonDumpStorage(self.testDir,**self.initKwargs[2]) + input_crashes = self._create_multidump_data() + for crash_id, raw_crash, dumps_dict in input_crashes: + storage.new_entry(crash_id, raw_crash, dumps_dict) + + for crash_id, raw_crash, dumps_dict in input_crashes: + base_path, parts_unused = storage.lookupNamePath(crash_id) + pathname = "%s/%s%s" % (base_path, crash_id, storage.jsonSuffix) + eq_(storage.getJson(crash_id), pathname) + with open(pathname) as f: + d = json.load(f) + eq_(d, raw_crash) + dump_paths = storage.get_dumps(crash_id) + for a_dump_name, a_dump_contents in dumps_dict.iteritems(): + dump_pathname = self._expected_dump_path(base_path, crash_id, + a_dump_name) + ok_(dump_pathname in dump_paths.values()) + with open(dump_pathname) as f: + read_contents = f.read() + eq_(read_contents, a_dump_contents) + + + def test_getDump_with_names(self): + storage = JDS.JsonDumpStorage(self.testDir,**self.initKwargs[2]) + input_crashes = self._create_multidump_data() + for crash_id, raw_crash, dumps_dict in input_crashes: + storage.new_entry(crash_id, raw_crash, dumps_dict) + + crash_id = '0bba61c5-dfc3-43e7-dead-8afd20081225' + base_path, parts_unused = storage.lookupNamePath(crash_id) + + expected_dump_path = self._expected_dump_path(base_path, crash_id, None) + actual_dump_path = storage.getDump(crash_id) + eq_(expected_dump_path, actual_dump_path) + + expected_dump_path = self._expected_dump_path(base_path, crash_id, None) + actual_dump_path = storage.getDump(crash_id, storage.dump_field) + eq_(expected_dump_path, actual_dump_path) + + expected_dump_path = self._expected_dump_path(base_path, crash_id, + 'aux_dump_0') + actual_dump_path = storage.getDump(crash_id, 'aux_dump_0') + eq_(expected_dump_path, actual_dump_path) + + expected_dump_path = self._expected_dump_path(base_path, crash_id, + 'aux_dump_1') + actual_dump_path = storage.getDump(crash_id, 'aux_dump_1') + eq_(expected_dump_path, actual_dump_path) + + assert_raises(OSError, storage.getDump, + crash_id, 'does_not_exist') + + def test_remove(self): + storage = JDS.JsonDumpStorage(self.testDir,**self.initKwargs[2]) + input_crashes = self._create_multidump_data() + for crash_id, raw_crash, dumps_dict in input_crashes: + storage.new_entry(crash_id, raw_crash, dumps_dict) + + crash_id = '0bba61c5-dfc3-3333-dead-8afd20081225' + storage.remove(crash_id) + + assert_raises(OSError, storage.getDump, crash_id) + assert_raises(OSError, storage.get_dumps, crash_id) + + def test_quickDelete(self): + storage = JDS.JsonDumpStorage(self.testDir,**self.initKwargs[2]) + input_crashes = self._create_multidump_data() + for crash_id, raw_crash, dumps_dict in input_crashes: + storage.new_entry(crash_id, raw_crash, dumps_dict) + + crash_id = '0bba61c5-dfc3-3333-dead-8afd20081225' + storage.quickDelete(crash_id) + + assert_raises(OSError, storage.getDump, crash_id) + assert_raises(OSError, storage.get_dumps, crash_id) + + + + +if __name__ == "__main__": + unittest.main() diff --git a/socorro/unittest/external/filesystem/test_json_dump_storage_permissions.py b/socorro/unittest/external/filesystem/test_json_dump_storage_permissions.py new file mode 100644 index 0000000000..45c85d6472 --- /dev/null +++ b/socorro/unittest/external/filesystem/test_json_dump_storage_permissions.py @@ -0,0 +1,63 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
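+#
+# Unit tests for JsonDumpStorage permission handling: newEntry() is expected
+# to apply the configured dumpPermissions to the json/dump files it creates
+# and dirPermissions to the directories along the name and date branches.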
+ +import datetime as DT +import os +import os.path as p +import shutil +import stat +import time + +import socorro.external.filesystem.json_dump_storage as JDS +import uuid as socorro_uuid +from socorro.lib.util import SilentFakeLogger +import socorro.external.filesystem.filesystem as socorro_fs +from socorro.unittest.testbase import TestCase +from socorro.lib.datetimeutil import UTC + +class TestJsonDumpStoragePermissions(TestCase): + def setUp(self): + self.testDir = os.path.join('/tmp', 'TESTPERM') + self.testMoveFrom = os.path.join('/tmp', 'TESTPERM-MOVEFROM') + def tearDown(self): + try: + shutil.rmtree(self.testDir) + except OSError: + pass # ok if there is no such directory + try: + shutil.rmtree(self.testMoveFrom) + except OSError: + pass + + + def testNewEntryPermissions(self): + dirPermissions=0707 + dumpPermissions=0500 + sfl = SilentFakeLogger() + j = JDS.JsonDumpStorage(root=self.testDir,dirPermissions=dirPermissions,dumpPermissions=dumpPermissions,logger=sfl) + u = str(socorro_uuid.uuid1()) + f1, f2 = j.newEntry(u) + f1.close() + f2.close() + jpath = j.getJson(u) + gotPermissions = stat.S_IMODE(os.stat(jpath)[0]) + assert stat.S_IMODE(os.stat(jpath)[0]) == dumpPermissions, "%s: Expected %o, got %o" % (jpath, dumpPermissions, gotPermissions) + + dpath = j.getDump(u) + gotPermissions = stat.S_IMODE(os.stat(dpath)[0]) + assert stat.S_IMODE(os.stat(dpath)[0]) == dumpPermissions, "%s: Expected %o, got %o" % (dpath, dumpPermissions, gotPermissions) + + udir = os.path.split(dpath)[0] + datePath = os.path.abspath(os.path.join(udir,os.readlink(os.path.splitext(dpath)[0]))) + namePath = os.path.abspath(os.path.splitext(dpath)[0]) + topPath = os.path.abspath(self.testDir) + dailies = os.listdir(topPath) + + def assertPermVisitor(p): + gotPerm = stat.S_IMODE(os.stat(p)[0]) + assert dirPermissions == gotPerm, "%s: Expected %0o, got %0o"%(p,dirPermissions,gotPerm) + for d in dailies: + # visitPath quietly ignores a file as the leaf + socorro_fs.visitPath(os.path.join(topPath,d),datePath,assertPermVisitor) + socorro_fs.visitPath(os.path.join(topPath,d),namePath,assertPermVisitor) diff --git a/socorro/unittest/external/filesystem/test_processed_dump_storage.py b/socorro/unittest/external/filesystem/test_processed_dump_storage.py new file mode 100644 index 0000000000..86b4e6fe29 --- /dev/null +++ b/socorro/unittest/external/filesystem/test_processed_dump_storage.py @@ -0,0 +1,266 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
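+#
+# Unit tests for ProcessedDumpStorage: constructor option handling, the
+# newEntry/putDumpToFile directory layout under the daily name and date
+# branches, getDumpPath/getDumpFromFile lookups, and removeDumpFile cleanup.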
+ +import datetime as datetime +import gzip +import os +import shutil +import time + +from nose.tools import * + +import socorro.unittest.testlib.createJsonDumpStore as createJDS + +import socorro.lib.util as socorro_util +import socorro.external.filesystem.processed_dump_storage as dumpStorage +from socorro.lib.datetimeutil import utc_now, UTC +from socorro.unittest.testbase import TestCase + + +bogusData= { + "signature": "nsThread::ProcessNextEvent(int, int*)", + "uuid": "not in my back yard", + "date_processed": "2009-03-31 14:45:09.215601", + "install_age": 100113, + "uptime": 7, + "last_crash": 95113, + "product": "Thunderbird", + "version": "3.0b2", + "build_id": "20090223121634", + "branch": "1.9.1", + "os_name": "Mac OS X", + "os_version": "10.5.6 9G55", + "cpu_name": "x86", + "cpu_info": "GenuineIntel family 6 model 15 stepping 6", + "crash_reason": "EXC_BAD_ACCESS / KERN_INVALID_ADDRESS", + "crash_address": "0xe9b246", + "User Comments": "This thing crashed.\nHelp me Kirk.", + "app_notes": "", + "success": True, + "truncated": True, + "processor_notes": "", + "distributor":"", + "distributor_version": "", + "dump":"OS|Mac OS X|10.5.6 9G55\nCPU|x86|GenuineIntel family 6 model 15 stepping 6|2\nCrash|EXC_BAD_ACCESS / KERN_INVALID_ADDRESS|0xe9b246|0\nModule|thunderbird-bin||thunderbird-bin|57E2541E130E4A6ABA7A66E16DD0F79F0|0x00001000|0x00c40fff|1\nModule|Cocoa||Cocoa|E064F94D969CE25CB7DE3CFB980C32490|0x00d7a000|0x00d7afff|0\nModule|libmozjs.dylib||libmozjs.dylib|F69DA57AFA0A404880BF4A765E9E09090|0x00d7e000|0x00e39fff|0\nModule|libxpcom.dylib||libxpcom.dylib|55F6A143264C4B8EADBB7D789C8905260|0x00e56000|0x00e56fff|0\nModule|libxpcom_core.dylib||libxpcom_core.dylib|2CC80C82B4304EA0B757E6A0E3CCB8130|0x00e5b000|0x00ec8fff|0\nModule|libplds4.dylib||libplds4.dylib|7A7A59FAC31B48F0A76B00220784A3CE0|0x00eec000|0x00ef1fff|0\nModule|libplc4.dylib||libplc4.dylib|54972A9756C042A0803B41260DDC16F90|0x00ef6000|0x00efbfff|0\nModule|libnspr4.dylib||libnspr4.dylib|1EA68C7035DF4D47B8E3365DD16AF2480|0x00f01000|0x00f27fff|0\nModule|SystemConfiguration||SystemConfiguration|8B26EBF26A009A098484F1ED01EC499C0|0x00f37000|0x00f6efff|0\nModule|Carbon||Carbon|98A5E3BC0C4FA44BBB09713BB88707FE0|0x00f8d000|0x00f8dfff|0\nModule|AddressBook||AddressBook|60DDAE72A1DF8DDBC5C53DF92F372B760|0x00f91000|0x01110fff|0\nModule|QuickTime||QuickTime|BC0920ABBBAAD03F5513AC7FFBD306330|0x011ec000|0x01511fff|0\nModule|IOKit||IOKit|F9F5F0D070E197A832D86751E1D445450|0x015d7000|0x01662fff|0\nModule|libcrypto.0.9.7.dylib||libcrypto.0.9.7.dylib|69BC2457AA23F12FA7D052601D48FA290|0x01688000|0x0173afff|0\nModule|libcups.2.dylib||libcups.2.dylib|16BEC7C6A004F744804E2281A1B1C0940|0x01789000|0x017b1fff|0\nModule|CoreAudio||CoreAudio|F35477A5E23DB0FA43233C37DA01AE1C0|0x017bc000|0x01839fff|0\nModule|AudioToolbox||AudioToolbox|E1BBA7B890E8B8EEC3E3EE900773B7710|0x0188e000|0x019e0fff|0\nModule|AudioUnit||AudioUnit|880380CB87BE2B31914A5934EB3BA6BA0|0x01a55000|0x01a55fff|0\nModule|libsmime3.dylib||libsmime3.dylib|382327EE00224E038C8C1FDD0012DEF50|0x01a5a000|0x01a6ffff|0\nModule|libssl3.dylib||libssl3.dylib|37026155F0D04D6E983982A0AD28E35D0|0x01a7c000|0x01aa3fff|0\nModule|libnss3.dylib||libnss3.dylib|C96CB0108FC14F458CCD31FC6943D4FF0|0x01aae000|0x01b7cfff|0\nModule|libnssutil3.dylib||libnssutil3.dylib|CAF0F4B360B2437194780F72B09E66E40|0x01ba6000|0x01bb2fff|0\nModule|libsoftokn3.dylib||libsoftokn3.dylib|60D20BEB4EF949518B089C1AF864F8670|0x01bbc000|0x01be4fff|0\nModule|libldap60.dylib||libldap60.dylib|F8737FDA25F94DE783A9ABBC9975A8F90|0x01bed000|0x01c15
fff|0\nModule|libprldap60.dylib||libprldap60.dylib|703DD9D1527041C0BB089FBCAD19F1F70|0x01c21000|0x01c26fff|0\nModule|libldif60.dylib||libldif60.dylib|42C22A365F58422D9BDE93C75761F2370|0x01c2c000|0x01c30fff|0\nModule|libsqlite3.dylib||libsqlite3.dylib|778D1BC8256143779AB80CCB9D42C8A70|0x01c35000|0x01c96fff|0\nModule|libstdc++.6.dylib||libstdc++.6.dylib|04B812DCEC670DAA8B7D2852AB14BE600|0x01c9e000|0x01cfbfff|0\nModule|libgcc_s.1.dylib||libgcc_s.1.dylib|F53C808E87D1184C0F9DF63AEF53CE0B0|0x01d4c000|0x01d53fff|0\nModule|libSystem.B.dylib||libSystem.B.dylib|D68880DFB1F8BECDBDAC6928DB1510FB0|0x01d59000|0x01ec0fff|0\nModule|AppKit||AppKit|A3A300499BBE4F1DFEBF71D752D019160|0x01f4f000|0x0274dfff|0\nModule|CoreData||CoreData|8E28162EF2288692615B52ACC01F8B540|0x02c8e000|0x02d73fff|0\nModule|ApplicationServices||ApplicationServices|8F910FA65F01D401AD8D04CC933CF8870|0x02dee000|0x02deefff|0\nModule|DesktopServicesPriv||DesktopServicesPriv|D16642BA22C32F67BE793EBFBE67CA3A0|0x02df6000|0x02e80fff|0\nModule|Foundation||Foundation|8FE77B5D15ECDAE1240B4CB604FC6D0B0|0x02ecb000|0x03146fff|0\nModule|HIToolbox||HIToolbox|3747086BA21EE419708A5CAB946C8BA60|0x032a4000|0x035acfff|0\nModule|QuartzCore||QuartzCore|2FED2DD7565C84A0F0C608D41D4D172C0|0x0370a000|0x03aa7fff|0\nModule|Security||Security|55DDA7486DF4E8E1D61505BE16F83A1C0|0x03ba1000|0x03d6ffff|0\nModule|SpeechRecognition||SpeechRecognition|D3180F9EDBD9A5E6F283D6156AA3C6020|0x03eb4000|0x03ebdfff|0\nModule|libauto.dylib||libauto.dylib|42D8422DC23A18071869FDF7B5D8FAB50|0x03ec7000|0x03ef2fff|0\nModule|libicucore.A.dylib||libicucore.A.dylib|18098DCF431603FE47EE027A60006C850|0x03f00000|0x04038fff|0\nModule|libxml2.2.dylib||libxml2.2.dylib|D69560099D9EB32BA7F8A17BAA65A28D0|0x0408c000|0x0416dfff|0\nModule|libz.1.dylib||libz.1.dylib|5DDD8539AE2EBFD8E7CC1C57525385C70|0x0419a000|0x041a8fff|0\nModule|CoreUI||CoreUI|676FAF4FF6DDDBDD7D716FCE0E59349A0|0x041ae000|0x041e8fff|0\nModule|DiskArbitration||DiskArbitration|75B0C8D8940A8A27816961DDDCAC8E0F0|0x04208000|0x04210fff|0\nModule|CoreServices||CoreServices|2FCC8F3BD5BBFC000B476CAD8E6A3DD20|0x0421a000|0x0421afff|0\nModule|libobjc.A.dylib||libobjc.A.dylib|7B92613FDF804FD9A0A3733A0674C30B0|0x04222000|0x04302fff|0\nModule|CoreFoundation||CoreFoundation|4A70C8DBB582118E31412C53DC1F407F0|0x04374000|0x044a7fff|0\nModule|ATS||ATS|8C51DE0EC3DEAEF416578CD59DF387540|0x0459f000|0x04632fff|0\nModule|ColorSync||ColorSync|FD78C64B42F804AE9B0BAE75AAD2C5100|0x04659000|0x04724fff|0\nModule|CoreGraphics||CoreGraphics|3A91D1037AFDE01D1D8ACDF9CD1CAA140|0x04765000|0x04e05fff|0\nModule|CoreText||CoreText|F9A90116AE34A2B0D84E87734766FB3A0|0x04ed5000|0x04f2ffff|0\nModule|HIServices||HIServices|01B690D1F376E400AC873105533E39EB0|0x04f6f000|0x04fc0fff|0\nModule|ImageIO||ImageIO|6A6623D3D1A7292B5C3763DCD108B55F0|0x04fea000|0x05130fff|0\nModule|LangAnalysis||LangAnalysis|8B7831B5F74A950A56CF2D22A2D436F60|0x05188000|0x05198fff|0\nModule|QD||QD|B743398C24C38E581A86E91744A2BA6E0|0x051a5000|0x0524cfff|0\nModule|SpeechSynthesis||SpeechSynthesis|06D8FC0307314F8FFC16F206AD3DBF440|0x05275000|0x05285fff|0\nModule|CarbonCore||CarbonCore|F06FE5D92D56AC5AA52D1BA1827459240|0x05294000|0x0556efff|0\nModule|CFNetwork||CFNetwork|80851410A5592B7C3B149B2FF849BCC10|0x055d8000|0x05675fff|0\nModule|Metadata||Metadata|E0572F20350523116F23000676122A8D0|0x056ed000|0x05736fff|0\nModule|OSServices||OSServices|2A135D4FB16F4954290F7B72B4111AA30|0x05752000|0x0580cfff|0\nModule|SearchKit||SearchKit|3140A605DB2ABF56B237FA156A08B28B0|0x05871000|0x058f0fff|0\nModule|AE||AE|4CB9EF65CF116D6D
D424F0CE98C2D0150|0x05933000|0x05962fff|0\nModule|LaunchServices||LaunchServices|6F9629F4ED1BA3BB313548E6838B28880|0x0597a000|0x05a06fff|0\nModule|DictionaryServices||DictionaryServices|AD0AA0252E3323D182E17F50DEFE56FC0|0x05a4c000|0x05a62fff|0\nModule|libmathCommon.A.dylib||libmathCommon.A.dylib|D75DC85A7C3CA075A24E7252869B76600|0x05a74000|0x05a78fff|0\nModule|libbsm.dylib||libbsm.dylib|D25C63378A5029648FFD4B4669BE31BF0|0x05a7c000|0x05a83fff|0\nModule|libsqlite3.0.dylib||libsqlite3.0.dylib|6978BBCCA4277D6AE9F042BEFF643F7D0|0x05a8a000|0x05b11fff|0\nModule|libxslt.1.dylib||libxslt.1.dylib|0A9778D6368AE668826F446878DEB99B0|0x05b1e000|0x05b42fff|0\nModule|Accelerate||Accelerate|274CA63B852C0701F86FDB679198FDDB0|0x05b4c000|0x05b4cfff|0\nModule|vImage||vImage|2A2C9E354B6491A892802B0BD97F1CC80|0x05b50000|0x05c17fff|0\nModule|vecLib||vecLib|274CA63B852C0701F86FDB679198FDDB0|0x05c27000|0x05c27fff|0\nModule|libvMisc.dylib||libvMisc.dylib|2C407027985293C0B174294688D390650|0x05c2b000|0x05ca8fff|0\nModule|libvDSP.dylib||libvDSP.dylib|B232C018DDD040EC4E2C2AF632DD497F0|0x05cb6000|0x05ce3fff|0\nModule|libBLAS.dylib||libBLAS.dylib|3769D952F2378FCA4FCCAA61527C8ACF0|0x05cef000|0x060fffff|0\nModule|libLAPACK.dylib||libLAPACK.dylib|9B0ED359D604DC6CA6389560C0BC679F0|0x06145000|0x06503fff|0\nModule|libJPEG.dylib||libJPEG.dylib|E7EB56555109E23144924CD64AA8DAEC0|0x06539000|0x06558fff|0\nModule|libTIFF.dylib||libTIFF.dylib|3589442575AC77746AE99ECF724F5F870|0x06560000|0x0659ffff|0\nModule|libGIF.dylib||libGIF.dylib|572A32E46E33BE1EC041C5EF5B0341AE0|0x065aa000|0x065aefff|0\nModule|libPng.dylib||libPng.dylib|4780E979D35AA5EC2CEA22678836CEA50|0x065b4000|0x065cffff|0\nModule|libRadiance.dylib||libRadiance.dylib|8A844202FCD65662BB9AB25F08C45A620|0x065d7000|0x065d9fff|0\nModule|libresolv.9.dylib||libresolv.9.dylib|A8018C42930596593DDF27F7C20FE7AF0|0x065de000|0x065fcfff|0\nModule|vecLib||vecLib|274CA63B852C0701F86FDB679198FDDB0|0x06606000|0x06606fff|0\nModule|InstallServer||InstallServer|A0358A24A32E1E9813A1575185B3398F0|0x0660a000|0x0660afff|0\nModule|CarbonSound||CarbonSound|0F2BA6E891D3761212CF5A5E6134D6830|0x0660e000|0x06618fff|0\nModule|OpenGL||OpenGL|7E5048A2677B41098C84045305F42F7F0|0x06621000|0x0662efff|0\nModule|libGLImage.dylib||libGLImage.dylib|1123B8A48BCBE9CC7AA8DD8E1A214A660|0x06636000|0x06674fff|0\nModule|libffi.dylib||libffi.dylib|A3B573EB950CA583290F7B2B4C486D090|0x0667e000|0x0667ffff|0\nModule|CoreVideo||CoreVideo|C0D869876AF51283A160CD2224A23ABF0|0x06684000|0x0669cfff|0\nModule|libGLU.dylib||libGLU.dylib|7C4BC24ABDD4C859788D6874F906D5190|0x066b0000|0x06709fff|0\nModule|libGL.dylib||libGL.dylib|AB2164E7650463E7167B603B325B409C0|0x0671d000|0x06729fff|0\nModule|libGLProgrammability.dylib||libGLProgrammability.dylib|5D283543AC844E7C6FA3440AC56CD2650|0x06737000|0x06c08fff|0\nModule|CommonPanels||CommonPanels|EA0665F57CD267609466ED8B2B20E8930|0x06d35000|0x06d3afff|0\nModule|Help||Help|B507B08E484CB89033E9CF23062D77DE0|0x06d43000|0x06d46fff|0\nModule|HTMLRendering||HTMLRendering|FE87A9DEDE38DB00E6C8949942C6BD4F0|0x06d4c000|0x06da8fff|0\nModule|ImageCapture||ImageCapture|0C71CF9C4A8D4A4A763DC52E7C4703870|0x06dd6000|0x06debfff|0\nModule|Ink||Ink|BF3FA8927B4B8BAAE92381A976FD20790|0x06e06000|0x06e99fff|0\nModule|NavigationServices||NavigationServices|91844980804067B07A0B6124310D3F310|0x06eb8000|0x06efafff|0\nModule|OpenScripting||OpenScripting|572C7452D7E740E8948A5AD07A99602B0|0x06f28000|0x06f40fff|0\nModule|SecurityHI||SecurityHI|2B2854123FED609D1820D2779E2E09630|0x06f52000|0x06f54fff|0\nModule|DirectoryService||Di
rectoryService|F8931F64103C8A86B82E9714352F43230|0x06f5a000|0x06f78fff|0\nModule|LDAP||LDAP|CC04500CF7B6EDCCC75BB3FE2973F72C0|0x06f85000|0x06fb7fff|0\nModule|DSObjCWrappers||DSObjCWrappers|09DEB9E32D0D09DFB95AE569BDD2B7A40|0x06fc2000|0x06fd1fff|0\nModule|Backup||Backup|60FDC2CDE17C2689677F2DCFD592407D0|0x06fdf000|0x06fe4fff|0\nModule|libsasl2.2.dylib||libsasl2.2.dylib|BB7971CA2F609C070F87786A93D1041E0|0x06fee000|0x06ffdfff|0\nModule|libssl.0.9.7.dylib||libssl.0.9.7.dylib|C7359B7AB32B5F8574520746E10A41CC0|0x07005000|0x07029fff|0\nModule|libalerts_s.dylib||libalerts_s.dylib|129E11DF8E9E4DABA8DA0F4E41769CEB0|0x07308000|0x07313fff|0\nModule|Unicode Encodings||Unicode Encodings|542F2B8930D6BDF16C318FFEA541ACAB0|0x07395000|0x07396fff|0\nModule|libSimplifiedChineseConverter.dylib||libSimplifiedChineseConverter.dylib|548D5A699DBE2BB8FCC8275321FDC0D40|0x073ae000|0x073bcfff|0\nModule|HelpData||HelpData|28D5C89696B963716210925D91D4A26D0|0x073e4000|0x073f0fff|0\nModule|Shortcut||Shortcut|057783867138902B52BC0941FEDB74D10|0x07500000|0x07528fff|0\nModule|libCGATS.A.dylib||libCGATS.A.dylib|386DCE4B28448FB86E33E06AC466F4D80|0x077c6000|0x077cdfff|0\nModule|libRIP.A.dylib||libRIP.A.dylib|5D0B5AF7992E14DE017F9A9C7CB059600|0x1f885000|0x1f8c6fff|0\nModule|libCSync.A.dylib||libCSync.A.dylib|E6ACEED359BD228F42BC1246AF5919C90|0x1f8d3000|0x1f8defff|0\nModule|Kerberos||Kerberos|685CC018C133668D0D3AC6A1CB63CFF90|0x20000000|0x200b0fff|0\nModule|libnssdbm3.dylib||libnssdbm3.dylib|2312FD8609554909A693BA9248E6E8420|0x213de000|0x213fafff|0\nModule|libfreebl3.dylib||libfreebl3.dylib|85DA857F16A34E85A40D3042EE4866AA0|0x21469000|0x214c9fff|0\nModule|libnssckbi.dylib||libnssckbi.dylib|A68FB6ED4263406096AD73E9A2D2B5C40|0x214d1000|0x21509fff|0\nModule|RawCamera||RawCamera|FEA6D22F985AEC2F376D937291B54ECC0|0x222c0000|0x22492fff|0\nModule|Print||Print|8BF7EF71216376D12FCD5EC17E43742C0|0x64b00000|0x64b06fff|0\nModule|libJapaneseConverter.dylib||libJapaneseConverter.dylib|7B0248C392848338F5D6ED093313EEEF0|0xba900000|0xba916fff|0\nModule|PrintCore||PrintCore|222DADE7B33B99708B8C09D1303F93FC0|0xfa100000|0xfa17afff|0\n\n0|0|libxpcom_core.dylib|nsThread::ProcessNextEvent(int, int*)|hg:hg.mozilla.org/releases/mozilla-1.9.1:xpcom/threads/nsThread.cpp:c1141fd20875|510|0x5\n0|1|libxpcom_core.dylib|NS_ProcessPendingEvents_P(nsIThread*, unsigned 
int)|nsThreadUtils.cpp|180|0x16\n0|2|thunderbird-bin|nsBaseAppShell::NativeEventCallback()|hg:hg.mozilla.org/releases/mozilla-1.9.1:widget/src/xpwidgets/nsBaseAppShell.cpp:c1141fd20875|121|0x17\n0|3|thunderbird-bin|nsAppShell::ProcessGeckoEvents(void*)|hg:hg.mozilla.org/releases/mozilla-1.9.1:widget/src/cocoa/nsAppShell.mm:c1141fd20875|374|0x7\n0|4|CoreFoundation||||0x735f4\n0|5|CoreFoundation||||0x73cd7\n0|6|HIToolbox||||0x302bf\n0|7|HIToolbox||||0x300d8\n0|8|HIToolbox||||0x2ff4c\n0|9|AppKit||||0x40d7c\n0|10|AppKit||||0x4062f\n0|11|AppKit||||0x3966a\n0|12|thunderbird-bin|nsAppShell::Run()|hg:hg.mozilla.org/releases/mozilla-1.9.1:widget/src/cocoa/nsAppShell.mm:c1141fd20875|693|0x79\n0|13|thunderbird-bin|nsAppStartup::Run()|hg:hg.mozilla.org/releases/mozilla-1.9.1:toolkit/components/startup/src/nsAppStartup.cpp:c1141fd20875|192|0x7\n0|14|thunderbird-bin|XRE_main|hg:hg.mozilla.org/releases/mozilla-1.9.1:toolkit/xre/nsAppRunner.cpp:c1141fd20875|3279|0x7\n0|15|thunderbird-bin|main|/builds/releases/3.0b2/2/comm-central/mail/app/nsMailApp.cpp|103|0x18\n0|16|thunderbird-bin||||0x2105\n0|17|thunderbird-bin||||0x202c\n0|18|||||0x1\n1|0|libSystem.B.dylib||||0x3830a\n1|1|libnspr4.dylib|poll|hg:hg.mozilla.org/releases/mozilla-1.9.1:nsprpub/pr/src/md/unix/unix.c:c1141fd20875|3672|0x2f\n1|2|libnspr4.dylib|_pr_poll_with_poll|hg:hg.mozilla.org/releases/mozilla-1.9.1:nsprpub/pr/src/pthreads/ptio.c:c1141fd20875|3916|0x15\n1|3|libnspr4.dylib|PR_Poll|hg:hg.mozilla.org/releases/mozilla-1.9.1:nsprpub/pr/src/pthreads/ptio.c:c1141fd20875|4318|0x18\n1|4|thunderbird-bin|nsSocketTransportService::Poll(int, unsigned int*)|hg:hg.mozilla.org/releases/mozilla-1.9.1:netwerk/base/src/nsSocketTransportService2.cpp:c1141fd20875|355|0xf\n1|5|thunderbird-bin|nsSocketTransportService::DoPollIteration(int)|hg:hg.mozilla.org/releases/mozilla-1.9.1:netwerk/base/src/nsSocketTransportService2.cpp:c1141fd20875|660|0x18\n1|6|thunderbird-bin|nsSocketTransportService::OnProcessNextEvent(nsIThreadInternal*, int, unsigned int)|hg:hg.mozilla.org/releases/mozilla-1.9.1:netwerk/base/src/nsSocketTransportService2.cpp:c1141fd20875|539|0xf\n1|7|libxpcom_core.dylib|nsThread::ProcessNextEvent(int, int*)|hg:hg.mozilla.org/releases/mozilla-1.9.1:xpcom/threads/nsThread.cpp:c1141fd20875|497|0x25\n1|8|libxpcom_core.dylib|NS_ProcessNextEvent_P(nsIThread*, int)|nsThreadUtils.cpp|227|0x15\n1|9|thunderbird-bin|nsSocketTransportService::Run()|hg:hg.mozilla.org/releases/mozilla-1.9.1:netwerk/base/src/nsSocketTransportService2.cpp:c1141fd20875|581|0x12\n1|10|libxpcom_core.dylib|nsThread::ProcessNextEvent(int, int*)|hg:hg.mozilla.org/releases/mozilla-1.9.1:xpcom/threads/nsThread.cpp:c1141fd20875|510|0x7\n1|11|libxpcom_core.dylib|NS_ProcessNextEvent_P(nsIThread*, 
int)|nsThreadUtils.cpp|227|0x15\n1|12|libxpcom_core.dylib|nsThread::ThreadFunc(void*)|hg:hg.mozilla.org/releases/mozilla-1.9.1:xpcom/threads/nsThread.cpp:c1141fd20875|254|0xf\n1|13|libnspr4.dylib|_pt_root|hg:hg.mozilla.org/releases/mozilla-1.9.1:nsprpub/pr/src/pthreads/ptthread.c:c1141fd20875|221|0x8\n1|14|libSystem.B.dylib||||0x32094\n1|15|libSystem.B.dylib||||0x31f51\n2|0|libSystem.B.dylib||||0x1226\n2|1|libSystem.B.dylib||||0x7daae\n2|2|libnspr4.dylib|pt_TimedWait|hg:hg.mozilla.org/releases/mozilla-1.9.1:nsprpub/pr/src/pthreads/ptsynch.c:c1141fd20875|280|0x18\n2|3|libnspr4.dylib|PR_WaitCondVar|hg:hg.mozilla.org/releases/mozilla-1.9.1:nsprpub/pr/src/pthreads/ptsynch.c:c1141fd20875|407|0x17\n2|4|libxpcom_core.dylib|TimerThread::Run()|hg:hg.mozilla.org/releases/mozilla-1.9.1:xpcom/threads/TimerThread.cpp:c1141fd20875|345|0xe\n2|5|libxpcom_core.dylib|nsThread::ProcessNextEvent(int, int*)|hg:hg.mozilla.org/releases/mozilla-1.9.1:xpcom/threads/nsThread.cpp:c1141fd20875|510|0x7\n2|6|libxpcom_core.dylib|NS_ProcessNextEvent_P(nsIThread*, int)|nsThreadUtils.cpp|227|0x15\n2|7|libxpcom_core.dylib|nsThread::ThreadFunc(void*)|hg:hg.mozilla.org/releases/mozilla-1.9.1:xpcom/threads/nsThread.cpp:c1141fd20875|254|0xf\n2|8|libnspr4.dylib|_pt_root|hg:hg.mozilla.org/releases/mozilla-1.9.1:nsprpub/pr/src/pthreads/ptthread.c:c1141fd20875|221|0x8\n2|9|libSystem.B.dylib||||0x32094\n2|10|libSystem.B.dylib||||0x31f51\n3|0|libSystem.B.dylib||||0x120e\n3|1|libSystem.B.dylib||||0x78538\n3|2|libnspr4.dylib|PR_WaitCondVar|hg:hg.mozilla.org/releases/mozilla-1.9.1:nsprpub/pr/src/pthreads/ptsynch.c:c1141fd20875|405|0x10\n3|3|libnspr4.dylib|PR_Wait|hg:hg.mozilla.org/releases/mozilla-1.9.1:nsprpub/pr/src/pthreads/ptsynch.c:c1141fd20875|584|0x11\n3|4|libxpcom_core.dylib|nsEventQueue::GetEvent(int, nsIRunnable**)|../../dist/include/xpcom/nsAutoLock.h|340|0xf\n3|5|libxpcom_core.dylib|nsThread::ProcessNextEvent(int, int*)|hg:hg.mozilla.org/releases/mozilla-1.9.1:xpcom/threads/nsThread.h:c1141fd20875|112|0x15\n3|6|libxpcom_core.dylib|NS_ProcessNextEvent_P(nsIThread*, int)|nsThreadUtils.cpp|227|0x15\n3|7|libxpcom_core.dylib|nsThread::ThreadFunc(void*)|hg:hg.mozilla.org/releases/mozilla-1.9.1:xpcom/threads/nsThread.cpp:c1141fd20875|254|0xf\n3|8|libnspr4.dylib|_pt_root|hg:hg.mozilla.org/releases/mozilla-1.9.1:nsprpub/pr/src/pthreads/ptthread.c:c1141fd20875|221|0x8\n3|9|libSystem.B.dylib||||0x32094\n3|10|libSystem.B.dylib||||0x31f51\n4|0|libSystem.B.dylib||||0x1226\n4|1|libSystem.B.dylib||||0x7daae\n4|2|libnspr4.dylib|pt_TimedWait|hg:hg.mozilla.org/releases/mozilla-1.9.1:nsprpub/pr/src/pthreads/ptsynch.c:c1141fd20875|280|0x18\n4|3|libnspr4.dylib|PR_WaitCondVar|hg:hg.mozilla.org/releases/mozilla-1.9.1:nsprpub/pr/src/pthreads/ptsynch.c:c1141fd20875|407|0x17\n4|4|libnspr4.dylib|PR_Wait|hg:hg.mozilla.org/releases/mozilla-1.9.1:nsprpub/pr/src/pthreads/ptsynch.c:c1141fd20875|584|0x11\n4|5|libxpcom_core.dylib|nsThreadPool::Run()|../../dist/include/xpcom/nsAutoLock.h|340|0xd\n4|6|libxpcom_core.dylib|nsThread::ProcessNextEvent(int, int*)|hg:hg.mozilla.org/releases/mozilla-1.9.1:xpcom/threads/nsThread.cpp:c1141fd20875|510|0x7\n4|7|libxpcom_core.dylib|NS_ProcessNextEvent_P(nsIThread*, 
int)|nsThreadUtils.cpp|227|0x15\n4|8|libxpcom_core.dylib|nsThread::ThreadFunc(void*)|hg:hg.mozilla.org/releases/mozilla-1.9.1:xpcom/threads/nsThread.cpp:c1141fd20875|254|0xf\n4|9|libnspr4.dylib|_pt_root|hg:hg.mozilla.org/releases/mozilla-1.9.1:nsprpub/pr/src/pthreads/ptthread.c:c1141fd20875|221|0x8\n4|10|libSystem.B.dylib||||0x32094\n4|11|libSystem.B.dylib||||0x31f51\n5|0|libSystem.B.dylib||||0x1226\n5|1|libSystem.B.dylib||||0x7daae\n5|2|libnspr4.dylib|pt_TimedWait|hg:hg.mozilla.org/releases/mozilla-1.9.1:nsprpub/pr/src/pthreads/ptsynch.c:c1141fd20875|280|0x18" + } + +class TestProcessedDumpStorage(TestCase): + def setUp(self): + self.testDir = os.path.join('.','TEST-DUMPSTORAGE')+'/' + fakeLogger = socorro_util.SilentFakeLogger() + self.initKwargs = { + 0:{'logger': fakeLogger,}, + 1:{'logger': fakeLogger,'fileSuffix':'DSgz',}, + 2:{'logger': fakeLogger,'fileSuffix':'.DSgz',}, + 3:{'logger': fakeLogger,'gzipCompression':'3',}, + 4:{'logger': fakeLogger,'storageDepth':'3',}, + 5:{'logger': fakeLogger,'rootName':'someRoot', 'dateName':'someDate', 'minutesPerSlot':'12'} + } + + try: + shutil.rmtree(self.testDir) + except OSError: + pass # ok if there is no such test directory + os.mkdir(self.testDir) + + def tearDown(self): + pass + try: + shutil.rmtree(self.testDir) + except OSError: + pass # ok if there is no such test directory + + def dailyFromNow(self): + + return ''.join(utc_now().date().isoformat().split('-')) + + def dailyFromDate(self,dateString): + """given "YYYY-mm-dd-hh-mm" return YYYYmmdd string""" + return ''.join(dateString.split('-')[:3]) + + def relativeDateParts(self,dateString,minutesPerSlot): + """ given "YYYY-mm-dd-hh-mm", return [hh,slot]""" + hh,mm = dateString.split('-')[-2:] + slot = int(mm) - int(mm)%minutesPerSlot + return [hh,"%02d"%slot] + def hourSlotFromNow(self,minutesPerSlot): + hh,mm = utc_now().isoformat('T').split('T')[1].split(':')[:2] + slot = int(mm) - int(mm)%minutesPerSlot + return hh,"%02d"%slot + + def testConstructor(self): + self.constructorAlt(self.testDir,**self.initKwargs[0]) + self.constructorAlt(self.testDir,**self.initKwargs[1]) + self.constructorAlt(self.testDir,**self.initKwargs[2]) + self.constructorAlt(self.testDir,**self.initKwargs[3]) + self.constructorAlt(self.testDir,**self.initKwargs[4]) + + def constructorAlt(self,*args,**kwargs): + storage = dumpStorage.ProcessedDumpStorage(self.testDir,**kwargs) + assert self.testDir.rstrip(os.sep) == storage.root,'From kwargs=%s'%kwargs + assert storage.indexName == kwargs.get('indexName','name'),'From kwargs=%s'%kwargs + suffix = kwargs.get('fileSuffix','.jsonz') + if not suffix.startswith('.'):suffix = '.%s'%suffix + assert suffix == storage.fileSuffix,'expected "%s", got "%s" From kwargs=%s'%(suffix,storage.fileSuffix,kwargs) + compression = int(kwargs.get('gzipCompression','9')) + assert compression == storage.gzipCompression + storageDepth = int(kwargs.get('storageDepth',2)) + assert storageDepth == storage.storageDepth,'Expected %s, got %s'%(storageDepth,storage.storageDepth) + mps = int(kwargs.get('minutesPerSlot',1)) + assert mps == storage.minutesPerSlot,'Expected %s, got %s'%(mps,storage.minutesPerSlot) + + def testNewEntry(self): + storage = dumpStorage.ProcessedDumpStorage(self.testDir,**self.initKwargs[0]) + for ooid,(tdate,wh,pathprefix,longDatePath) in createJDS.jsonFileData.items(): + dailyPart = ''.join(tdate.split('-')[:3]) + expectedDir = os.sep.join((storage.root,dailyPart,storage.indexName,pathprefix)) + expectedPath = 
os.path.join(expectedDir,"%s%s"%(ooid,storage.fileSuffix)) + hourPart,slot = self.relativeDateParts(tdate,storage.minutesPerSlot) + datepart = "%s_0"%(os.path.join(hourPart,slot)) + expectedDateDir = os.sep.join((storage.root,dailyPart,storage.dateName,datepart)) + testStamp = datetime.datetime(*[int(x) for x in tdate.split('-')],tzinfo=UTC) + fh = None + try: + fh = storage.newEntry(ooid,testStamp) + fh.write(expectedPath) + finally: + fh.close() + assert os.path.exists(expectedPath), 'Expected: gzip file %s but none there'%(expectedPath) + try: + fh = gzip.open(expectedPath) + firstline = fh.readline() + assert expectedPath == firstline, 'Expected this file to contain its own path, but %s'%firstline + nextline = fh.readline() + assert '' == nextline, 'Expected this file to contain ONLY its own path, but %s'%nextline + finally: + fh.close() + dToN = os.path.join(expectedDateDir,ooid) + assert os.path.islink(dToN),'Expected %s to be link exists:%s'%(dToN,os.path.exists(dToN)) + datapath = os.readlink(os.path.join(expectedDateDir,ooid)) + # The next lines prove we have a relative-path link + zigpath = os.path.join(expectedDateDir,datapath) + assert os.path.isfile(os.path.join(zigpath,"%s%s"%(ooid,storage.fileSuffix))) + assert os.path.pardir in zigpath,'But zigpath has no "parent directory" parts?: %s'%(zigpath) + + def testPutDumpToFile(self): + """ + testPutDumpToFile(self):(slow=2) + """ + storage = dumpStorage.ProcessedDumpStorage(self.testDir,**self.initKwargs[2]) + ooid = '0bae7049-bbff-49f2-dead-7e9fe2081125' # is coded for depth 2, so no special thought needed + data = createJDS.jsonFileData[ooid] + stamp = datetime.datetime(*[int(x) for x in data[0].split('-')],tzinfo=UTC) + expectedPath = os.sep.join((storage.root,self.dailyFromNow(),storage.indexName,data[2])) + expectedFile = os.path.join(expectedPath,ooid+storage.fileSuffix) + assert not os.path.exists(expectedPath), 'Better not exist at start of test' + data = {"header":"header","data":['line ONE','lineTWO','last line']} + now = utc_now() + if now.second > 57: + time.sleep(60-now.second) + now = utc_now() + storage.putDumpToFile(ooid,data,now) # default timestamp + datePath = None + seenDirs = set() + seenFiles = set() + for dirpath, dirnames, filenames in os.walk(storage.root): + for f in filenames: + if f.startswith(ooid): + seenFiles.add(os.path.join(dirpath,f)) + for d in dirnames: + if d.startswith(ooid): + seenDirs.add(os.path.join(dirpath,d)) + + for p in seenFiles: + assert storage.fileSuffix in p + assert storage.indexName in p + for p in seenDirs: + assert ooid == os.path.split(p)[1] + assert storage.dateName in p + + assert os.path.exists(expectedFile), 'Just a nicer way to say your test is FUBAR' + f = gzip.open(expectedFile) + lines = " ".join(f.readlines()) + f.close() + assert """{"header": "header", "data": ["line ONE", "lineTWO", "last line"]}""" == lines + + def testGetDumpPath(self): + storage = dumpStorage.ProcessedDumpStorage(self.testDir,**self.initKwargs[1]) + seq = 0 + seqs = {} + for ooid,(tdate,wh,pathprefix,longdatepath) in createJDS.jsonFileData.items(): + hh,slot = self.relativeDateParts(tdate,storage.minutesPerSlot) + seqs[ooid] = seq + expectedDir = os.sep.join((storage.root,self.dailyFromDate(tdate),storage.dateName,hh,"%s_0"%slot)) + expectedPath = os.path.join(expectedDir,"%s%s"%(ooid,storage.fileSuffix)) + stamp = datetime.datetime(*[int(x) for x in tdate.split('-')],tzinfo=UTC) + fh = storage.newEntry(ooid,stamp) + fh.write("Sequence Number %d\n"%seq) + fh.close() + seq += 1 + for ooid in 
createJDS.jsonFileData.keys(): + path = storage.getDumpPath(ooid) + f = gzip.open(path,'r') + lines = f.readlines() + f.close() + assert 1 == len(lines) + assert 'Sequence Number %d\n'%(seqs[ooid]) == lines[0],'But expected "Sequence Number %d\n", got "%s"'%(seqs[ooid],lines[0]) + assert_raises(OSError, storage.getDumpPath,createJDS.jsonBadUuid) + + def createDumpSet(self, dumpStorage): + for ooid,data in createJDS.jsonFileData.items(): + bogusData["uuid"] = ooid + stamp = datetime.datetime(*[int(x) for x in data[0].split('-')],tzinfo=UTC) + dumpStorage.putDumpToFile(ooid,bogusData,stamp) + + def testRemoveDumpFile(self): + storage = dumpStorage.ProcessedDumpStorage(self.testDir,**self.initKwargs[0]) + self.createDumpSet(storage) + expectedCount = len(createJDS.jsonFileData) + dumpFiles = set() + + # should fail quitely + storage.removeDumpFile(createJDS.jsonBadUuid) + + ooids = createJDS.jsonFileData.keys() + for dir,dirs,files in os.walk(storage.root): + dumpFiles.update(files) + assert expectedCount == len(dumpFiles) + + #should happily remove them each and all + for ooid in ooids: + dumpFiles = set() + storage.removeDumpFile(ooid) + expectedCount -= 1 + for dir,dirs,files in os.walk(storage.root): + dumpFiles.update(files) + assert expectedCount == len(dumpFiles),'\n %s: expected %d, but %d\n - %s'%(ooid,expectedCount,len(dumpFiles), '\n - '.join(dumpFiles)) + + def testGetDumpFromFile(self): + storage = dumpStorage.ProcessedDumpStorage(self.testDir,**self.initKwargs[0]) + self.createDumpSet(storage) + o = None + for ooid in createJDS.jsonFileData.keys(): + o = storage.getDumpFromFile(ooid) + bogusData['uuid'] = ooid + assert bogusData == o + assert_raises(OSError,storage.getDumpFromFile,createJDS.jsonBadUuid) + + def testSecondNewEntryAfterRemove(self): + storage = dumpStorage.ProcessedDumpStorage(self.testDir,**self.initKwargs[0]) + ooid,(tdate,ig1,pathprefix,longDatePath) = createJDS.jsonFileData.items()[1] + testStamp = datetime.datetime(*[int(x) for x in tdate.split('-')],tzinfo=UTC) + fh = storage.newEntry(ooid,testStamp) + fh.close() + storage.removeDumpFile(ooid) + #Next line fails ugly and useless unless we have fixed the problem + nh = None + try: + nh = storage.newEntry(ooid,testStamp) + finally: + if nh: + nh.close() + + +if __name__ == "__main__": + unittest.main() diff --git a/socorro/unittest/external/hbase/__init__.py b/socorro/unittest/external/hbase/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/socorro/unittest/external/hbase/test_connection_context.py b/socorro/unittest/external/hbase/test_connection_context.py new file mode 100644 index 0000000000..aee27d0332 --- /dev/null +++ b/socorro/unittest/external/hbase/test_connection_context.py @@ -0,0 +1,187 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
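+#
+# Unit tests for HBaseConnectionContextPooled using a mocked hbase_client:
+# connections should be pooled and reused per identity, and commit/rollback
+# should be driven through TransactionExecutor on success and on failure.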
+ +import mock +from nose.tools import eq_, ok_, assert_raises + +from socorro.external.hbase.connection_context import \ + HBaseConnectionContextPooled +from socorro.external.hbase.hbase_client import ( + FatalException, + NoConnectionException +) +from socorro.lib.util import SilentFakeLogger, DotDict +from socorro.database.transaction_executor import TransactionExecutor +from socorro.unittest.testbase import TestCase +from configman import Namespace + +from hbase import ttypes +from thrift import Thrift +from socket import timeout, error + + +class FakeHB_Connection(object): + def __init__(self): + self.hbaseThriftExceptions = (error,) + self.close_counter = 0 + self.commit_counter = 0 + self.rollback_counter = 0 + + def close(self): + self.close_counter += 1 + + def commit(self): + self.commit_counter += 1 + + def rollback(self): + self.rollback_counter += 1 + + +class TestConnectionContext(TestCase): + + @mock.patch('socorro.external.hbase.connection_context.hbase_client') + def test_basic_hbase_usage(self, mocked_hbcl): + local_config = DotDict({ + 'hbase_host': 'host', + 'database_name': 'name', + 'hbase_port': 9090, + 'hbase_timeout': 9000, + 'number_of_retries': 2, + 'logger': SilentFakeLogger(), + 'executor_identity': lambda: 'dwight' # bogus thread id + }) + a_fake_hbase_connection = FakeHB_Connection() + mocked_hbcl.HBaseConnectionForCrashReports = \ + mock.Mock(return_value=a_fake_hbase_connection) + hb_context = HBaseConnectionContextPooled( + local_config, + local_config + ) + eq_( + mocked_hbcl.HBaseConnectionForCrashReports.call_count, + 1 + ) + eq_( + a_fake_hbase_connection.close_counter, + 1 + ) + # open a connection + with hb_context() as conn: + eq_( + mocked_hbcl.HBaseConnectionForCrashReports.call_count, + 2 + ) + eq_( + a_fake_hbase_connection.close_counter, + 1 + ) + # get that same connection again + with hb_context() as conn: + eq_( + mocked_hbcl.HBaseConnectionForCrashReports.call_count, + 2 + ) + eq_( + a_fake_hbase_connection.close_counter, + 1 + ) + # get a named connection + with hb_context('fred') as conn: + eq_( + mocked_hbcl.HBaseConnectionForCrashReports.call_count, + 3 + ) + eq_( + a_fake_hbase_connection.close_counter, + 1 + ) + eq_( + len(hb_context.pool), + 2 + ) + # get that original same connection again + with hb_context() as conn: + eq_( + mocked_hbcl.HBaseConnectionForCrashReports.call_count, + 3 + ) + eq_( + a_fake_hbase_connection.close_counter, + 1 + ) + # close all connections + hb_context.close() + eq_( + a_fake_hbase_connection.close_counter, + 3 + ) + + @mock.patch('socorro.external.hbase.connection_context.hbase_client') + def test_hbase_usage_with_transaction(self, mocked_hbcl): + local_config = DotDict({ + 'hbase_host': 'host', + 'database_name': 'name', + 'hbase_port': 9090, + 'hbase_timeout': 9000, + 'number_of_retries': 2, + 'logger': SilentFakeLogger(), + 'executor_identity': lambda: 'dwight' # bogus thread id + }) + a_fake_hbase_connection = FakeHB_Connection() + mocked_hbcl.HBaseConnectionForCrashReports = \ + mock.Mock(return_value=a_fake_hbase_connection) + hb_context = HBaseConnectionContextPooled( + local_config, + local_config + ) + def all_ok(connection, dummy): + eq_(dummy, 'hello') + return True + + transaction = TransactionExecutor(local_config, hb_context) + result = transaction(all_ok, 'hello') + ok_(result) + eq_( + mocked_hbcl.HBaseConnectionForCrashReports.call_count, + 2 + ) + eq_( + a_fake_hbase_connection.close_counter, + 1 + ) + eq_( + a_fake_hbase_connection.rollback_counter, + 0 + ) + eq_( + 
a_fake_hbase_connection.commit_counter, + 1 + ) + + def bad_deal(connection, dummy): + raise KeyError('fred') + + assert_raises(KeyError, transaction, bad_deal, 'hello') + eq_( + mocked_hbcl.HBaseConnectionForCrashReports.call_count, + 2 + ) + eq_( + a_fake_hbase_connection.close_counter, + 1 + ) + eq_( + a_fake_hbase_connection.rollback_counter, + 1 + ) + eq_( + a_fake_hbase_connection.commit_counter, + 1 + ) + + hb_context.close() + eq_( + a_fake_hbase_connection.close_counter, + 2 + ) diff --git a/socorro/unittest/external/hbase/test_crash_data.py b/socorro/unittest/external/hbase/test_crash_data.py new file mode 100644 index 0000000000..112c1ab08a --- /dev/null +++ b/socorro/unittest/external/hbase/test_crash_data.py @@ -0,0 +1,225 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +import os +from nose.plugins.skip import SkipTest +from configman import ConfigurationManager, Namespace +from mock import Mock, patch +from nose.tools import eq_, assert_raises + +from socorro.external import MissingArgumentError, ResourceNotFound, \ + ResourceUnavailable +from socorro.external.hbase import crash_data, crashstorage, hbase_client +from socorro.unittest.testbase import TestCase + + +_run_integration_tests = os.environ.get('RUN_HBASE_INTEGRATION_TESTS', False) +if _run_integration_tests in ('false', 'False', 'no', '0'): + _run_integration_tests = False + + + +class TestIntegrationHBaseCrashData(TestCase): + + def setUp(self): + if not _run_integration_tests: + raise SkipTest("Skipping HBase integration tests") + self.config_manager = self._common_config_setup() + + with self.config_manager.context() as config: + store = crashstorage.HBaseCrashStorage(config.hbase) + + # A complete crash report (raw, dump and processed) + fake_raw_dump_1 = 'peter is a swede' + fake_raw_dump_2 = 'lars is a norseman' + fake_raw_dump_3 = 'adrian is a frenchman' + fake_dumps = {'upload_file_minidump': fake_raw_dump_1, + 'lars': fake_raw_dump_2, + 'adrian': fake_raw_dump_3} + fake_raw = { + 'name': 'Peter', + 'legacy_processing': 0, + 'submitted_timestamp': '2013-05-04' + } + fake_processed = { + 'name': 'Peter', + 'uuid': '114559a5-d8e6-428c-8b88-1c1f22120314', + 'completeddatetime': '2012-01-01T00:00:00', + 'email': 'peter@fake.org', + } + + store.save_raw_crash( + fake_raw, + fake_dumps, + '114559a5-d8e6-428c-8b88-1c1f22120314' + ) + store.save_processed(fake_processed) + + # A non-processed crash report + fake_raw = { + 'name': 'Adrian', + 'legacy_processing': 0, + 'submitted_timestamp': '2013-05-04' + } + + store.save_raw_crash( + fake_raw, + fake_dumps, + '58727744-12f5-454a-bcf5-f688a2120821' + ) + + def tearDown(self): + with self.config_manager.context() as config: + connection = hbase_client.HBaseConnectionForCrashReports( + config.hbase.hbase_host, + config.hbase.hbase_port, + config.hbase.hbase_timeout + ) + for row in connection.merge_scan_with_prefix( + 'crash_reports', '', ['ids:ooid']): + index_row_key = row['_rowkey'] + connection.client.deleteAllRow( + 'crash_reports', index_row_key) + # because of HBase's async nature, deleting can take time + list(connection.iterator_for_all_legacy_to_be_processed()) + + def _common_config_setup(self): + mock_logging = Mock() + required_config = Namespace() + required_config.namespace('hbase') + required_config.hbase = \ + crashstorage.HBaseCrashStorage.get_required_config() + 
required_config.hbase.add_option('logger', default=mock_logging) + config_manager = ConfigurationManager( + [required_config], + app_name='testapp', + app_version='1.0', + app_description='app description', + values_source_list=[{'hbase': { + 'logger': mock_logging + }}] + ) + return config_manager + + @patch('socorro.external.rabbitmq.priorityjobs.Priorityjobs') + def test_get(self, priorityjobs_mock): + with self.config_manager.context() as config: + + #priorityjobs_mock = Mock() + service = crash_data.CrashData( + config=config, + all_services={'Priorityjobs': priorityjobs_mock} + ) + params = { + 'datatype': 'raw', + 'uuid': '114559a5-d8e6-428c-8b88-1c1f22120314' + } + + # Test 1: get a raw dump + res_expected = ('peter is a swede', + 'application/octet-stream') + res = service.get(**params) + + eq_(res, res_expected) + + # Test 2: get a raw crash + params['datatype'] = 'meta' + res_expected = { + 'name': 'Peter', + 'legacy_processing': 0, + 'submitted_timestamp': '2013-05-04' + } + res = service.get(**params) + + eq_(res, res_expected) + + # Test 3: get a processed crash + params['datatype'] = 'processed' + res_expected = { + 'name': 'Peter', + 'uuid': '114559a5-d8e6-428c-8b88-1c1f22120314', + 'completeddatetime': '2012-01-01T00:00:00' + } + res = service.get(**params) + + eq_(res, res_expected) + + # Test 3a: get a unredacted processed crash + params['datatype'] = 'unredacted' + res_expected = { + 'name': 'Peter', + 'uuid': '114559a5-d8e6-428c-8b88-1c1f22120314', + 'completeddatetime': '2012-01-01T00:00:00', + 'email': 'peter@fake.org', + } + res = service.get(**params) + + eq_(res, res_expected) + + # Test 4: missing parameters + assert_raises( + MissingArgumentError, + service.get + ) + assert_raises( + MissingArgumentError, + service.get, + **{'uuid': '114559a5-d8e6-428c-8b88-1c1f22120314'} + ) + + # Test 5: crash cannot be found + assert_raises( + ResourceNotFound, + service.get, + **{ + 'uuid': 'c44245f4-c93b-49b8-86a2-c15dc2130504', + 'datatype': 'processed' + } + ) + # Test 5a: crash cannot be found + assert_raises( + ResourceNotFound, + service.get, + **{ + 'uuid': 'c44245f4-c93b-49b8-86a2-c15dc2130504', + 'datatype': 'unredacted' + } + ) + + # Test 6: not yet available crash + assert_raises( + ResourceUnavailable, + service.get, + **{ + 'uuid': '58727744-12f5-454a-bcf5-f688a2120821', + 'datatype': 'processed' + } + ) + priorityjobs_mock.cls.return_value.create.assert_called_once_with( + uuid='58727744-12f5-454a-bcf5-f688a2120821' + ) + priorityjobs_mock.cls.return_value.create.reset_mock() + + # Test 6a: not yet available crash + assert_raises( + ResourceUnavailable, + service.get, + **{ + 'uuid': '58727744-12f5-454a-bcf5-f688a2120821', + 'datatype': 'unredacted' + } + ) + priorityjobs_mock.cls.return_value.create.assert_called_once_with( + uuid='58727744-12f5-454a-bcf5-f688a2120821' + ) + + # Test 7: raw crash cannot be found + assert_raises( + ResourceNotFound, + service.get, + **{ + 'uuid': 'c44245f4-c93b-49b8-86a2-c15dc2130505', + 'datatype': 'raw' + } + ) diff --git a/socorro/unittest/external/hbase/test_crashstorage.py b/socorro/unittest/external/hbase/test_crashstorage.py new file mode 100644 index 0000000000..a20ed72113 --- /dev/null +++ b/socorro/unittest/external/hbase/test_crashstorage.py @@ -0,0 +1,455 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
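+#
+# Integration tests for HBaseCrashStorage against a live HBase instance;
+# the test class below is only defined when RUN_HBASE_INTEGRATION_TESTS is
+# set in the environment.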
+ +import os +import time +import json +from contextlib import nested + +import mock +from nose.tools import eq_, ok_, assert_raises +from configman import ConfigurationManager + +from socorro.external.hbase import hbase_client + +from socorro.external.crashstorage_base import ( + CrashIDNotFound, + Redactor, + MemoryDumpsMapping +) +from socorro.external.hbase.crashstorage import HBaseCrashStorage +from socorro.external.hbase.connection_context import \ + HBaseConnectionContextPooled +from socorro.lib.util import DotDict +from socorro.unittest.config import commonconfig +from socorro.database.transaction_executor import ( + TransactionExecutorWithLimitedBackoff +) +from socorro.unittest.testbase import TestCase + + +class SomeThriftError(Exception): + pass + +_run_integration_tests = os.environ.get('RUN_HBASE_INTEGRATION_TESTS', False) +if _run_integration_tests in ('false', 'False', 'no', '0'): + _run_integration_tests = False + + +if not _run_integration_tests: + import logging + logging.warning("Skipping HBase integration tests") + +else: + + class TestIntegrationHBaseCrashStorage(TestCase): + """ + If you ever get this:: + Traceback (most recent call last): + ... + socorro.external.hbase.hbase_client.FatalException: the connection + is not viable. retries fail: + + Then try the following: + + /etc/init.d/hadoop-hbase-master restart + /etc/init.d/hadoop-hbase-thrift restart + + Also, you can look in /var/log/hbase for clues. + Still not working, try: + + hbase shell + > describe 'crash_reports' + + and keep an eye on the logs. + """ + + def tearDown(self): + super(TestIntegrationHBaseCrashStorage, self).tearDown() + self._truncate_hbase_table() + + def _truncate_hbase_table(self): + connection = hbase_client.HBaseConnectionForCrashReports( + commonconfig.hbaseHost.default, + commonconfig.hbasePort.default, + 100 + ) + for row in connection.merge_scan_with_prefix( + 'crash_reports', '', ['ids:ooid']): + index_row_key = row['_rowkey'] + connection.client.deleteAllRow( + 'crash_reports', index_row_key) + # because of HBase's async nature, deleting can take time + list(connection.iterator_for_all_legacy_to_be_processed()) + + def test_basic_hbase_crashstorage(self): + mock_logging = mock.Mock() + required_config = HBaseCrashStorage.required_config + required_config.add_option('logger', default=mock_logging) + + config_manager = ConfigurationManager( + [required_config], + app_name='testapp', + app_version='1.0', + app_description='app description', + values_source_list=[{ + 'logger': mock_logging, + 'hbase_timeout': 100, + 'hbase_host': commonconfig.hbaseHost.default, + 'hbase_port': commonconfig.hbasePort.default, + }], + argv_source=[] + ) + with config_manager.context() as config: + crashstorage = HBaseCrashStorage(config) + eq_(list(crashstorage.new_crashes()), []) + + crash_id = '86b58ff2-9708-487d-bfc4-9dac32121214' + + raw = ('{"name":"Peter", ' + '"submitted_timestamp":"%d"}' % time.time()) + fake_raw_dump_1 = 'peter is a swede' + fake_raw_dump_2 = 'lars is a norseman' + fake_raw_dump_3 = 'adrian is a frenchman' + fake_dumps = MemoryDumpsMapping({ + 'upload_file_minidump': fake_raw_dump_1, + 'lars': fake_raw_dump_2, + 'adrian': fake_raw_dump_3 + }) + crashstorage.save_raw_crash(json.loads(raw), + fake_dumps, + crash_id) + + assert config.logger.info.called + assert config.logger.info.call_count > 1 + msg_tmpl, msg_arg = config.logger.info.call_args_list[1][0] + # ie logging.info(