PublicAnalysisDatabase: exclusion_CLs.py

File exclusion_CLs.py, 16.5 KB (added by BenjF, 2 years ago)

Statistics module for testing exclusion

Line 
1#! /usr/bin/env python
2
3# derive exclusions for the tested scenario from the signal regions of an
4# analysis implemented using MadAnalysis 5 (MA5), using a simplified procedure
5# and the CLs prescription
6
7# version 1.1 (July 9, 2014)
8# made by Beranger Dumont
9# based on toy MC code by Benjamin Fuks, Chris Wymant and Sam Bein
10# v1.1: BF added an efficiency map calculator
11
12# takes as input:
13# -- an XML analysis_name.info file that should be present in the same directory
14# as the analysis code, i.e. Build/SampleAnalyzer/User/Analyzer/
15# -- the SAF files for the cutflows of the tested scenario of interest in all
16# signal regions, where the information on acceptance*efficiency can be found
17# -- the cross section for the tested scenario (if not given as argument in
18# command-line, it is looked for in the SAF file analysis_name.saf
19
20# returns the output on the screen and print basic results
21# in the file analysis_name_[run number].out in
22# Output/benchmark_point/
23
24
25# path to the directory where the analysis code and output is present
26# (this is the directory created when running MA5 in expert mode,
27# containing the "Build", "Input" and "Output" directories)
28analysis_path = "./"
29
30# The number of Poisson distributions we consider (each one effectively being
31# a toy experiment with its own certain prediction for the background):
32numtoyexperiments = 100000
33
34#
35# the user is not supposed to modify the code below this line
36#
37
38import os, sys
39try:
40    from lxml import ET
41except:
42    import xml.etree.ElementTree as ET
43import scipy.stats
44
45def usage():
46    print 'Usage: ./exclusion_CLs.py analysis_name benchmark_point ' + \
47      '[run_number] [cross section in pb]'
48    print 'Example: ./exclusion_CLs.py cms_sus_13_011 T2tt_650_50.txt ' + \
49      '0 0.014'
50    print 'Default value of run number is 0'
51    print 'If the cross section is not given, it is taken from the MA5 output'
52
53def listdirectory(path):
54    lfile=[]
55    for root, dirs, files in os.walk(path): 
56        for i in files: 
57            lfile.append(os.path.join(root, i))
58    return lfile
59
60bestSR = ""
61
62def clean_name(str):
63    # based on CleanName() from
64    # ./tools/SampleAnalyzer/Process/Core/SampleAnalyzer.cpp
65    # in MA5 v1.1.11beta4
66    str = str.replace("/",  "_slash_")
67    str = str.replace("->", "_to_")
68    str = str.replace(">=", "_greater_than_or_equal_to_")
69    str = str.replace(">",  "_greater_than_")
70    str = str.replace("<=", "_smaller_than_or_equal_to_")
71    str = str.replace("<",  "_smaller_than_")
72    str = str.replace(" ",  "_")
73    str = str.replace(",",  "_")
74    str = str.replace("+",  "_")
75    str = str.replace("-",  "_")
76    str = str.replace("(",  "_lp_")
77    str = str.replace(")",  "_rp_")
78    return str
79
80def CLs(NumObserved, ExpectedBG, BGError, SigHypothesis, NumToyExperiments):
81    # generate a set of expected-number-of-background-events, one for each toy
82    # experiment, distributed according to a Gaussian with the specified mean
83    # and uncertainty
84    ExpectedBGs = scipy.stats.norm.rvs(loc=ExpectedBG, \
85    scale=BGError, size=NumToyExperiments)
86
87    # Ignore values in the tail of the Gaussian extending to negative numbers
88    ExpectedBGs = [value for value in ExpectedBGs if value > 0]
89
90    # For each toy experiment, get the actual number of background events by
91    # taking one value from a Poisson distribution created using the expected
92    # number of events.
93    ToyBGs = scipy.stats.poisson.rvs(ExpectedBGs)
94    ToyBGs = map(float, ToyBGs)
95
96    # The probability for the background alone to fluctutate as LOW as
97    # observed = the fraction of the toy experiments with backgrounds as low as
98    # observed = p_b.
99    # NB (1 - this p_b) corresponds to what is usually called p_b for CLs.
100    p_b = scipy.stats.percentileofscore(ToyBGs, NumObserved, kind='weak')*.01
101
102    # Toy MC for background+signal
103    ExpectedBGandS = [expectedbg + SigHypothesis for expectedbg in ExpectedBGs]
104    ToyBplusS = scipy.stats.poisson.rvs(ExpectedBGandS)
105    ToyBplusS = map(float, ToyBplusS)
106
107    # Calculate the fraction of these that are >= the number observed,
108    # giving p_(S+B). Divide by (1 - p_b) a la the CLs prescription.
109    p_SplusB = scipy.stats.percentileofscore(ToyBplusS, NumObserved, kind='weak')*.01
110   
111    return 1.-(p_SplusB / p_b) # 1 - CLs
112
113def exclusion_check(crosssection):
114    global bestSR
115
116    if len(signalregions) > 1:
117        # if more than one signal region, decide which signal regions
118        # yields the best expected limit
119        limit = -1.
120        for SR in signalregions:
121            nsignal = crosssection * lumi * 1000. * signalregions[SR]["acceff"]
122            nb = signalregions[SR]["nb"]
123            deltanb = signalregions[SR]["deltanb"]
124
125            limitSR = CLs(nb, nb, deltanb, nsignal, numtoyexperiments)
126            if limitSR > limit:
127                bestSR = SR
128                limit = limitSR
129    else:
130      bestSR = signalregions.keys()[0]
131
132    nsignal = crosssection * lumi * 1000. * signalregions[bestSR]["acceff"]
133    nobs = int(signalregions[bestSR]["nobs"])
134    nb = signalregions[bestSR]["nb"]
135    deltanb = signalregions[bestSR]["deltanb"]
136    bestCLs=CLs(nobs, nb, deltanb, nsignal, numtoyexperiments)
137
138    print 'The best expected signal region is "' + bestSR + '".'
139    print 'It has: nobs = ' + str(nobs) + ', nb = ' + str(nb) + ' \pm ' + \
140      str(deltanb) + ', nsignal = ' + str(round(nsignal,2)) + '.'
141
142    return bestCLs
143
144def exclusion_check95(crosssection):
145    return exclusion_check(crosssection)-0.95
146
147if analysis_path[-1] == "/":
148    analysis_path = analysis_path[:-1]
149
150# at least one argument, check if asking for help
151if len(sys.argv) < 3 or (sys.argv[1] == "-h" or sys.argv[1] == "--help"):
152    usage()
153    sys.exit()
154
155analysis_name = sys.argv[1]
156bench_name = sys.argv[2]
157if len(sys.argv) > 3:
158    run_number = sys.argv[3]
159else:
160    run_number = "0"
161if len(sys.argv) > 4:
162    try:
163        xsection = float(sys.argv[4])
164    except ValueError:
165        print 'Invalid cross section given as command-line argument: "' + \
166          sys.argv[4]+ '".'
167        sys.exit()
168else:
169    xsection = 0
170
171
172
173######################################
174# first, read the XML .info file
175# and fill the variable lumi
176# and the dictionary signalregions
177######################################
178
179lumi = 0 # integrated luminosity, in fb^-1
180signalregions = {}
181
182analysisinfo_path = analysis_path + "/Build/SampleAnalyzer/User/Analyzer/" + \
183    analysis_name + ".info"
184
185try:
186    info_input = open(analysisinfo_path)
187except IOError as e:
188    print 'I/O error({0}): {1}'.format(e.errno, e.strerror)
189    print 'Cannot open the XML info file "' + analysisinfo_path + '".'
190    sys.exit()
191
192info_tree = ET.parse(info_input)
193info_input.close()
194
195info_root = info_tree.getroot()
196if info_root.tag != "analysis":
197    print 'Invalid XML info file "' + analysisinfo_path + '".'
198    print 'Root tag should be <analysis>, not <' + info_root.tag + '>.'
199    sys.exit()
200if info_root.attrib["id"] != analysis_name:
201    print 'Invalid XML info file "' + analysisinfo_path + '".'
202    print 'Analysis id in root tag <analysis> should be "' + analysis_name + \
203          '", not "' + info_root.attrib["id"] + '".'
204    sys.exit()
205
206
207for child in info_root:
208    # for <lumi> tag
209    if child.tag == "lumi":
210        if lumi != 0:
211            print 'Warning: redefinition of the luminosity in the ' + \
212              'XML info file "' + analysisinfo_path + '".'
213
214        try:
215            lumi = float(child.text)
216        except TypeError: # empty tag is of type NULL
217            lumi = 0
218        except ValueError:
219            print 'Invalid XML info file "' + analysisinfo_path + '".'
220            print 'The value of the <lumi> tag is not a number.'
221            sys.exit()
222
223    # for the <region> tags
224    # if no type is specified, assumed to be a signal region
225    if child.tag == "region" and \
226      ("type" not in child.attrib or child.attrib["type"] == "signal"):
227        if "id" not in child.attrib:
228            print 'Invalid XML info file "' + analysisinfo_path + '".'
229            print 'Presence of <region> tags with no id attribute.'
230            sys.exit()
231
232        regionid = child.attrib["id"]
233
234        if regionid in signalregions:
235            # a <region> tag with the same id has already been defined
236            print 'Invalid XML info file "' + analysisinfo_path + '".'
237            print 'A region with id="' + regionid + ' is defined ' + \
238              'multiple times.'
239            sys.exit()
240
241        signalregions[regionid] = {"acceff": 0, "syst":0, "stat":0} # initialize efficiency to 0
242        for rchild in child:
243            if rchild.tag in ["nobs", "nb", "deltanb"]:
244                ntag = rchild.tag
245                if ntag in signalregions[regionid]:
246                    print 'Warning: redefinition of <' + ntag + '> in the ' + \
247                      'region "' + \
248                      regionid + '" of the XML info file "' + \
249                      analysisinfo_path + '".'
250
251                try:
252                    signalregions[regionid][ntag] = float(rchild.text)
253                except TypeError: # empty tag is of type NULL
254                    signalregions[regionid][ntag] = 0
255                except ValueError:
256                    print 'Invalid XML info file "' + analysisinfo_path + '".'
257                    print 'The value of the <' + ntag + '> tag in region "' + \
258                     regionid + '" is not a number.'
259                    sys.exit()
260
261######################################
262# then, read the SAF files
263# for the cutflows
264# generated by MA5
265######################################
266
267analysisinfo_path = analysis_path + "/Output/" + \
268    bench_name + "/" + analysis_name + "_" + run_number + "/Cutflows"
269
270listdir = listdirectory(analysisinfo_path)
271
272if not listdir:
273    print 'The directory "' + analysisinfo_path + '" containing the SAF' + \
274      ' files for the cutflows cannot be listed.'
275    sys.exit()
276
277for file in listdir:
278    # signal region (as defined in the XML info file)
279    # associated with the cutflow file
280    assoc_SR = ""
281
282    if file[-4:] != ".saf":
283        continue
284    SRname = file.split("/")[-1][:-4]
285
286    for regionid in signalregions:
287        for sr in regionid.split(";"):
288            if clean_name(sr) == SRname:
289                assoc_SR = sr
290
291    # if there is no signal region found associated with the SAF file
292    if assoc_SR == "":
293        print 'Warning: no region found associated with the SAF file "' + \
294          file + '"; will be skipped.'
295        continue
296
297    # otherwise, read the acceptance times efficiency from the SAF file
298    try:
299        SAF_cutflow = open(file)
300    except IOError as e:
301        print 'I/O error({0}): {1}'.format(e.errno, e.strerror)
302        print 'Cannot open the XML info file "' + file + '".'
303        sys.exit()
304
305    in_initialcounter = False
306    in_counter = False
307    initialnum = 0
308    finalnum = 0
309    for line in SAF_cutflow:
310        line = line.rstrip("\n").strip()
311        if line[:16] == "<InitialCounter>":
312            in_initialcounter = True
313            continue
314        if line[:17] == "</InitialCounter>":
315            in_initialcounter = False
316            continue
317        if line[:9] == "<Counter>":
318            in_counter = True
319            continue
320        if line[:10] == "</Counter>":
321            in_counter = False
322            continue
323
324        if in_initialcounter and line[-14:] == "sum of weights":
325            try:
326                initialnum = float(line.split()[0])
327            except:
328                print 'Invalid SAF file "' + file + '".'
329                print 'The initial number of events cannot be read.'
330                sys.exit()
331
332        if in_counter and line[-14:] == "sum of weights":
333            try:
334                finalnum = float(line.split()[0])
335            except:
336                print 'Invalid SAF file "' + file + '".'
337                print 'The number of events cannot be read.'
338                sys.exit()
339
340    if initialnum == 0:
341        print 'Invalid SAF file "' + file + '".'
342        print 'The number of events cannot be read.'
343        sys.exit()
344
345    for regionid in signalregions:
346        # for each SR id as defined in the XML info file...
347        for individualSR in regionid.split(";"):
348            # ...look for each individual SR...
349            if individualSR == assoc_SR:
350                # ... if it matches with the SAF file read, add the
351                # acceptance*efficiency to existing value
352                signalregions[regionid]["acceff"] += finalnum / initialnum
353                signalregions[regionid]["stat"] += finalnum
354
355######################################
356# then, get the cross section info
357# if not given as command-line
358# argument, look into SAF file
359######################################
360
361if xsection == 0: # not given as command-line argument
362    mainSAF_path = analysis_path + "/Output/" + \
363    bench_name + "/" + bench_name + ".saf"
364
365    try:
366        mainSAF = open(mainSAF_path)
367    except IOError as e:
368        print 'I/O error({0}): {1}'.format(e.errno, e.strerror)
369        print 'Cannot open the XML info file "' + mainSAF_path + '".'
370        sys.exit()
371
372    nline = 0
373    for line in mainSAF:
374        line = line.rstrip("\n").strip()
375        if line[:18] == "<SampleGlobalInfo>":
376            nline = 1
377            continue
378        if nline > 0:
379            nline += 1
380        if nline == 3:
381            try:
382                xsection = float(line.split()[0])
383            except:
384                print 'Invalid SAF file "' + mainSAF_path + '".'
385                print 'The cross section cannot be read.'
386                sys.exit()
387
388            if xsection <= 0.:
389                print 'Invalid cross section of ' + str(xsection) + ' pb.'
390                print 'The cross section cannot be read from the SAF file "' + \
391                  mainSAF_path + '".'
392                sys.exit()
393            break
394
395######################################
396# now, we have the information on the
397# cross section (variable xsection),
398# the luminosity (variable lumi),
399# the number of background events,
400# observed events, and the acceptance
401# times efficiency (in the dictionary
402# signalregions)
403#
404# we can proceed with the exclusion
405######################################
406
407if xsection > 0:
408    # first, decide which signal regions yields the best expected limit
409    final_limit = exclusion_check(xsection)
410
411    print '\nThis signal is excluded at the ' + str(round(final_limit*100.,1)) + \
412      '% CL (CLs=' + str(round(1-final_limit,3)) + ').'
413else:
414    # if a  negative cross section is given as input,
415    # the code is looking for the cross section that is excluded at 95% CL
416    # using a root-finding algorithm
417
418    print 'Negative cross section is given.'
419    print 'Will look for the cross section that is excluded at 95% CL'
420    print 'using a root-finding algorithm.\n'
421
422    # need to tune the lower and upper bound (corresponding to a cross section
423    # that we know is not excluded or excluded, respectively)
424    lowerb = 1. # 1 pb
425    upperb = 1. # 1 pb
426    while exclusion_check95(lowerb) > 0.:
427        lowerb *= 0.1
428    while exclusion_check95(upperb) < 0.:
429        upperb *= 10.
430
431    print '\nlower and upper bounds for the root-finding algorithm' + \
432      ' have been found: [%.2e %.2e] pb\n' % (lowerb, upperb)
433
434    final_limit = scipy.optimize.brentq(exclusion_check95, lowerb, upperb)
435
436    print '\nThe excluded cross section at 2 sigma is %.5E pb.' % final_limit
437
438# finally, write the results in an output file
439output_path = analysis_path + "/Output/" + \
440    bench_name + "/" + analysis_name + "_" + run_number + ".out"
441
442try:
443    output = open(output_path, "w")
444except IOError as e:
445    print 'I/O error({0}): {1}'.format(e.errno, e.strerror)
446    print 'Cannot create the output file "' + output_path + '".'
447    sys.exit()
448
449output.write(bestSR+'\n'+str(final_limit))
450output.close()
451
452# Efficiencies
453import math
454out = open('efficiencies_'+analysis_name+'.dat','w')
455Title=['Signal region', 'Efficiency', 'Stat. error', 'Syst. error', 'Total error', 'Best region']
456out.write(Title[0].center(50) + Title[1].center(16) + Title[2].center(21) + Title[3].center(16) +\
457    Title[4].center(18)+ Title[5].rjust(10)+'\n')
458for region in signalregions:
459  if signalregions[region]["stat"]!=0:
460     signalregions[region]["stat"] =1./math.sqrt(float(signalregions[region]["stat"]))
461  signalregions[region]["syst"] = float(signalregions[region]["syst"])
462  IsBest=False;
463  if region is bestSR:
464   IsBest=True
465  out.write(region.replace(' ','').ljust(50) + "%1.5f".center(15) %(signalregions[region]["acceff"]) +\
466     "%1.6f".center(15) %(signalregions[region]["stat"])+ \
467     "%1.6f".center(15) %(signalregions[region]["syst"])+ \
468     "%1.6f".center(15) %(math.sqrt(signalregions[region]["syst"]**2+signalregions[region]["stat"]**2))+\
469     str(int(IsBest)).rjust(8) +\
470     '\n')
471out.close()