Analysis with a ROOT Macro

Running a ROOT macro over a set of input parameters and files is a common analysis strategy within HEP.

Table of Contents

  • run_script.sh : The actual executable run by condor. It sets up the environment and runs ROOT.
  • anamacro.C : The ROOT macro that is being run. It imports the analyzer defined elsewhere and passes it the parameters parsed from the command line.
  • ana.sub : The condor submission file
  • list-queue.sh : The script that parses the job and sample listing files for the input parameters and prints the arguments, one line per job

The job submission is separated out like this so that you can run a series of tests to make sure that things are operating as expected; a command-line sketch of these checks follows the list.

  1. Make sure anamacro.C runs through ROOT as you are developing your analyzer.
  2. Make sure run_script.sh runs the same way, confirming that it sets up the correct environment
  3. Run list-queue.sh to make sure that the jobs are being fed the correct parameters
  4. Submit to the cluster
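
A minimal sketch of these checks from the command line, assuming you are in the directory holding these files (the input file names, cross section, and pileup file below are placeholders to swap for your own sample):

# 1. run the macro directly through ROOT while developing the analyzer
root -l -b -q 'anamacro.C("test.inputfiles.list","test.root",1.0,"pileup_mc.root")'

# 2. run the wrapper script inside the same container condor will use
/cvmfs/cms.cern.ch/common/cmssw-cc7 --command-to-run ./run_script.sh \
    input1.root,input2.root test.root 1.0 pileup_mc.root

# 3. print the per-job argument lines that condor will consume
./list-queue.sh | head

# 4. submit to the cluster
condor_submit ana.sub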

run_script.sh

#!/bin/bash

####################################################################################
# run ROOT macro 
#
# INPUTS:
#   1 - comma-separated list of input file names (condor copies them to CWD)
#   2 - output file name (condor copies it out of CWD)
#   3 - xsec
#   4 - MC pileup file
####################################################################################

set -e

source /cvmfs/cms.cern.ch/cmsset_default.sh

echo -e "Running job on machine:"
hostname

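# set up the CMSSW environment from the release area, then return to the job's scratch directory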
cd /local/cms/user/wadud/aNTGCmet/CMSSW_10_6_24/src
eval `scramv1 runtime -sh`
cd -

# write the local file listing the input data files (one per line)
echo "$1" | tr , '\n' > $2.inputfiles.list

root -l -b -q "/local/cms/user/eichl008/umn-cluster/user-manual/condor/rootmacro/anamacro.C(\"$2.inputfiles.list\",\"$2\",$3,\"$4\")"
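
For reference, the `tr` call above just rewrites the comma-separated list of input files (argument 1) into a one-file-per-line listing named after the output file, which is the format the macro reads. With placeholder file names:

echo "input1.root,input2.root" | tr , '\n'
# prints:
# input1.root
# input2.root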

anamacro.C

#include "/local/cms/user/gsorrent/antgc_analysis/hltSF/UL17/hltScaleFactorNtuples.cc"

/**
 * Run like:
 *    root -b -l -q '/full/path/to/macro.C("input_file","output_file",xsec,"mcpu")'
 */
void anamacro(std::string input_file, std::string output_file,
              double xsec, std::string mcpu) {
    std::cout << getCurrentTime() << std::endl;
    std::cout << "Begin root macro..." << std::endl;

    hltScaleFactorNtuples(input_file, output_file, xsec, mcpu,
        "/local/cms/user/wadud/aNTGCmet/aNTGC_analysis/data/pileupUL17/pileup_2017_data.root");

    std::cout << "End root macro!" << std::endl;
    std::cout << getCurrentTime() << std::endl;
}

ana.sub

# we run inside of CMS's CentOS7 container
executable = /bin/bash

# CVMFS is distributed to all nodes so the executable itself
#   does not need to be copied
transfer_executable = no

# we want condor to copy any output files for us
should_transfer_files   = Yes

# have condor wait until the end to copy
#   you could also have this be ON_SUCCESS if you only want
#   output files when the run script returns 0
when_to_transfer_output = ON_EXIT

# the script to run our job within the CentOS7 container
#   this should be a full path and should be on /local/cms/...
#   so that it can be seen by all nodes
run_script = /local/cms/user/eichl008/umn-cluster/user-manual/condor/rootmacro/run_script.sh

# the input file for the job
#   notice we can use the variable that is defined in the `queue` command below
transfer_input_files = $(input_file)

# terminal and condor output log files
#   this is helpful for debugging purposes but you can delete these lines
#   for slightly better performance
output = $BASENAME(input_file).out
error  = $(output)
log    = $BASENAME(input_file).log

# "hold" the job if the script exits with a non-zero exit code
#   this is a helpful way to list which jobs failed
#   we also store the failure-status in the hold reason sub code so you
#   can see it using condor_q
on_exit_hold = ExitCode != 0
on_exit_hold_subcode = ExitCode
on_exit_hold_reason = "Program exited with non-zero error status (stored in HoldReasonSubCode)"

# we need to pass a specific environment variable to the condor job so
#   the directories we need are mounted to the container
environment = SINGULARITY_BIND=/home/,/local/cms/,/export/scratch/

# the command line arguments given to the executable
#   from condor's point of view, the executable is just /bin/bash,
#   so the first argument is the container-entry script, followed by the run script we want executed inside it
#   I use the condor macro `BASENAME` here since condor will copy the input file to the working directory
#   the other arguments are also pulled from the variables defined by the `queue` command
arguments = "/cvmfs/cms.cern.ch/common/cmssw-cc7 --command-to-run $(run_script) $BASENAME(input_file) $(output_file) $(xsec) $(pileup_file)"

# submit a job for each line of arguments printed by the list-queue.sh bash script
#   the arguments are comma-separated, run ./list-queue.sh directly to see what
#   the output looks like without being consumed by condor
# the "initialdir" is used as the default directory for relative paths in a lot of condor_submit stuff
#   for us this is the relative path for output files so set this to the full path to the output directory
queue input_file, output_file, xsec, pileup_file, initialdir from ./list-queue.sh |
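
Once submitted with condor_submit ana.sub, progress can be followed with the usual condor tools; jobs whose run script exits non-zero are held by the expressions above, with the exit code stored in HoldReasonSubCode:

condor_q                     # overall progress of your jobs
condor_q -hold               # list held (failed) jobs and their hold reasons
condor_release <cluster_id>  # re-queue held jobs after fixing the problem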

list-queue.sh

#!/bin/bash

# list job queue for analysis

outputDir=/local/cms/user/eichl008/umn-cluster/user-manual/condor/rootmacro/2out2put
sample_file=/local/cms/user/gsorrent/antgc_analysis/samples/aNTGC_Samples-ntuples_2017UL.csv
job_list=/local/cms/user/gsorrent/antgc_analysis/hltSF/UL17/batch/jobList.txt
splitfiles=1

readarray -t jobList < "${job_list}"
{
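  # the bare read consumes the CSV header line of the sample file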
  read
  while IFS=, read -r shortName dataset xSec xSecUnc singleJobFileList mcPUfile Nevents SumW SumW2 Neff lumi || [ -n "$shortName" ];
  do
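    # only process samples whose short name appears in the job list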
    if [[ ! " ${jobList[@]} " =~ " ${shortName} " ]]; then
      continue
    fi

    jobBaseName=$(basename "${singleJobFileList}")
    jobBaseName="${jobBaseName%.*}"
    jobOutDir=${outputDir}/${jobBaseName}

    # create job directories
    [ -d "${jobOutDir}" ] || mkdir -p "${jobOutDir}"

    nFiles=$(wc -l < "${singleJobFileList}")
    # make sure any data path left on HDFS is updated to its new location on LOCAL
    #   also write the input file list to a file for later reading
    sed "s|/hdfs/cms/user/wadud/anTGC|/local/cms/user/wadud/aNTGCmet|" "${singleJobFileList}" > "${jobOutDir}/input_files.list"

    i=0
    while read -r input_file; do
      i=$((i+1))
      # skip files that don't exist
      [ -f "${input_file}" ] || continue
      # print row of inputs to run script
      echo "${input_file}, $(printf "%s_%05d.root" "${jobBaseName}" ${i}), ${xSec}, ${mcPUfile}, ${jobOutDir}"
    done < ${jobOutDir}/input_files.list
  done
} < "${sample_file}"

exit 0
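
Each line printed by this script becomes one condor job; the comma-separated fields map, in order, to the input_file, output_file, xsec, pileup_file, and initialdir variables used in ana.sub. A hypothetical example line (all paths and numbers are placeholders):

/path/to/sample/ntuple_0001.root, myJobList_00001.root, 123.4, /path/to/pileup_mc.root, /path/to/outputDir/myJobList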