1 // Copyright 2014 The Prometheus Authors
2 // Licensed under the Apache License, Version 2.0 (the "License");
3 // you may not use this file except in compliance with the License.
4 // You may obtain a copy of the License at
6 // http://www.apache.org/licenses/LICENSE-2.0
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
25 "github.com/beorn7/perks/quantile"
26 "github.com/golang/protobuf/proto"
28 dto "github.com/prometheus/client_model/go"
31 // quantileLabel is used for the label that defines the quantile in a
// summary. The name is reserved: newSummary panics with
// errQuantileLabelNotAllowed if it appears among the variable labels or the
// constant label pairs of a Desc, and NewSummaryVec panics if it is passed as
// one of the label names.
33 const quantileLabel = "quantile"
35 // A Summary captures individual observations from an event or sample stream and
36 // summarizes them in a manner similar to traditional summary statistics: 1. sum
37 // of observations, 2. observation count, 3. rank estimations.
39 // A typical use-case is the observation of request latencies. By default, a
40 // Summary provides the median, the 90th and the 99th percentile of the latency
41 // as rank estimations. However, the default behavior will change in the
42 // upcoming v0.10 of the library. There will be no rank estimations at all by
43 // default. For a sane transition, it is recommended to set the desired rank
44 // estimations explicitly.
46 // Note that the rank estimations cannot be aggregated in a meaningful way with
47 // the Prometheus query language (i.e. you cannot average or add them). If you
48 // need aggregatable quantiles (e.g. you want the 99th percentile latency of all
49 // queries served across all instances of a service), consider the Histogram
50 // metric type. See the Prometheus documentation for more details.
52 // To create Summary instances, use NewSummary.
53 type Summary interface {
57 // Observe adds a single observation to the summary.
61 // DefObjectives are the default Summary quantile values.
63 // Deprecated: DefObjectives will not be used as the default objectives in
64 // v0.10 of the library. The default Summary will have no quantiles then.
66 DefObjectives = map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001}
68 errQuantileLabelNotAllowed = fmt.Errorf(
69 "%q is not allowed as label name in summaries", quantileLabel,
73 // Default values for SummaryOpts.
75 // DefMaxAge is the default duration for which observations stay
77 DefMaxAge time.Duration = 10 * time.Minute
78 // DefAgeBuckets is the default number of buckets used to calculate the
79 // age of observations.
81 // DefBufCap is the standard buffer size for collecting Summary observations.
85 // SummaryOpts bundles the options for creating a Summary metric. It is
86 // mandatory to set Name to a non-empty string. While all other fields are
87 // optional and can safely be left at their zero value, it is recommended to set
88 // a help string and to explicitly set the Objectives field to the desired value
89 // as the default value will change in the upcoming v0.10 of the library.
90 type SummaryOpts struct {
91 // Namespace, Subsystem, and Name are components of the fully-qualified
92 // name of the Summary (created by joining these components with
93 // "_"). Only Name is mandatory, the others merely help structuring the
94 // name. Note that the fully-qualified name of the Summary must be a
95 // valid Prometheus metric name.
100 // Help provides information about this Summary.
102 // Metrics with the same fully-qualified name must have the same Help
106 // ConstLabels are used to attach fixed labels to this metric. Metrics
107 // with the same fully-qualified name must have the same label names in
108 // their ConstLabels.
110 // Due to the way a Summary is represented in the Prometheus text format
111 // and how it is handled by the Prometheus server internally, “quantile”
112 // is an illegal label name. Construction of a Summary or SummaryVec
113 // will panic if this label name is used in ConstLabels.
115 // ConstLabels are only used rarely. In particular, do not use them to
116 // attach the same labels to all your metrics. Those use cases are
117 // better covered by target labels set by the scraping Prometheus
118 // server, or by one specific metric (e.g. a build_info or a
119 // machine_role metric). See also
120 // https://prometheus.io/docs/instrumenting/writing_exporters/#target-labels,-not-static-scraped-labels
123 // Objectives defines the quantile rank estimates with their respective
124 // absolute error. If Objectives[q] = e, then the value reported for q
125 // will be the φ-quantile value for some φ between q-e and q+e. The
126 // default value is DefObjectives. It is used if Objectives is left at
127 // its zero value (i.e. nil). To create a Summary without Objectives,
128 // set it to an empty map (i.e. map[float64]float64{}).
130 // Deprecated: Note that the current value of DefObjectives is
131 // deprecated. It will be replaced by an empty map in v0.10 of the
132 // library. Please explicitly set Objectives to the desired value.
133 Objectives map[float64]float64
135 // MaxAge defines the duration for which an observation stays relevant
136 // for the summary. Must be positive. The default value is DefMaxAge.
139 // AgeBuckets is the number of buckets used to exclude observations that
140 // are older than MaxAge from the summary. A higher number has a
141 // resource penalty, so only increase it if the higher resolution is
142 // really required. For very high observation rates, you might want to
143 // reduce the number of age buckets. With only one age bucket, you will
144 // effectively see a complete reset of the summary each time MaxAge has
145 // passed. The default value is DefAgeBuckets.
148 // BufCap defines the default sample stream buffer size. The default
149 // value of DefBufCap should suffice for most uses. If there is a need
150 // to increase the value, a multiple of 500 is recommended (because that
151 // is the internal buffer size of the underlying package
152 // "github.com/beorn7/perks/quantile").
156 // Problem with the sliding-window decay algorithm... The Merge method of
157 // perks/quantile is actually not working as advertised - and it might be
158 // unfixable, as the underlying algorithm is apparently not capable of merging
159 // summaries in the first place. To avoid using Merge, we are currently adding
160 // observations to _each_ age bucket, i.e. the effort to add a sample is
161 // essentially multiplied by the number of age buckets. When rotating age
162 // buckets, we empty the previous head stream. On scrape time, we simply take
163 // the quantiles from the head stream (no merging required). Result: More effort
164 // on observation time, less effort on scrape time, which is exactly the
165 // opposite of what we try to accomplish, but at least the results are correct.
167 // The quite elegant previous contraption to merge the age buckets efficiently
168 // on scrape time (see code up to commit 6b9530d72ea715f0ba612c0120e6e09fbf1d49d0)
169 // can't be used anymore.
171 // NewSummary creates a new Summary based on the provided SummaryOpts.
172 func NewSummary(opts SummaryOpts) Summary {
175 BuildFQName(opts.Namespace, opts.Subsystem, opts.Name),
184 func newSummary(desc *Desc, opts SummaryOpts, labelValues ...string) Summary {
185 if len(desc.variableLabels) != len(labelValues) {
186 panic(makeInconsistentCardinalityError(desc.fqName, desc.variableLabels, labelValues))
189 for _, n := range desc.variableLabels {
190 if n == quantileLabel {
191 panic(errQuantileLabelNotAllowed)
194 for _, lp := range desc.constLabelPairs {
195 if lp.GetName() == quantileLabel {
196 panic(errQuantileLabelNotAllowed)
200 if opts.Objectives == nil {
201 opts.Objectives = DefObjectives
205 panic(fmt.Errorf("illegal max age MaxAge=%v", opts.MaxAge))
207 if opts.MaxAge == 0 {
208 opts.MaxAge = DefMaxAge
211 if opts.AgeBuckets == 0 {
212 opts.AgeBuckets = DefAgeBuckets
215 if opts.BufCap == 0 {
216 opts.BufCap = DefBufCap
219 if len(opts.Objectives) == 0 {
220 // Use the lock-free implementation of a Summary without objectives.
221 s := &noObjectivesSummary{
223 labelPairs: makeLabelPairs(desc, labelValues),
224 counts: [2]*summaryCounts{&summaryCounts{}, &summaryCounts{}},
226 s.init(s) // Init self-collection.
233 objectives: opts.Objectives,
234 sortedObjectives: make([]float64, 0, len(opts.Objectives)),
236 labelPairs: makeLabelPairs(desc, labelValues),
238 hotBuf: make([]float64, 0, opts.BufCap),
239 coldBuf: make([]float64, 0, opts.BufCap),
240 streamDuration: opts.MaxAge / time.Duration(opts.AgeBuckets),
242 s.headStreamExpTime = time.Now().Add(s.streamDuration)
243 s.hotBufExpTime = s.headStreamExpTime
245 for i := uint32(0); i < opts.AgeBuckets; i++ {
246 s.streams = append(s.streams, s.newStream())
248 s.headStream = s.streams[0]
250 for qu := range s.objectives {
251 s.sortedObjectives = append(s.sortedObjectives, qu)
253 sort.Float64s(s.sortedObjectives)
255 s.init(s) // Init self-collection.
259 type summary struct {
262 bufMtx sync.Mutex // Protects hotBuf and hotBufExpTime.
263 mtx sync.Mutex // Protects every other moving part.
264 // Lock bufMtx before mtx if both are needed.
268 objectives map[float64]float64
269 sortedObjectives []float64
271 labelPairs []*dto.LabelPair
276 hotBuf, coldBuf []float64
278 streams []*quantile.Stream
279 streamDuration time.Duration
280 headStream *quantile.Stream
282 headStreamExpTime, hotBufExpTime time.Time
285 func (s *summary) Desc() *Desc {
289 func (s *summary) Observe(v float64) {
291 defer s.bufMtx.Unlock()
294 if now.After(s.hotBufExpTime) {
297 s.hotBuf = append(s.hotBuf, v)
298 if len(s.hotBuf) == cap(s.hotBuf) {
303 func (s *summary) Write(out *dto.Metric) error {
304 sum := &dto.Summary{}
305 qs := make([]*dto.Quantile, 0, len(s.objectives))
309 // Swap bufs even if hotBuf is empty to set new hotBufExpTime.
310 s.swapBufs(time.Now())
314 sum.SampleCount = proto.Uint64(s.cnt)
315 sum.SampleSum = proto.Float64(s.sum)
317 for _, rank := range s.sortedObjectives {
319 if s.headStream.Count() == 0 {
322 q = s.headStream.Query(rank)
324 qs = append(qs, &dto.Quantile{
325 Quantile: proto.Float64(rank),
326 Value: proto.Float64(q),
333 sort.Sort(quantSort(qs))
338 out.Label = s.labelPairs
342 func (s *summary) newStream() *quantile.Stream {
343 return quantile.NewTargeted(s.objectives)
346 // asyncFlush needs bufMtx locked.
347 func (s *summary) asyncFlush(now time.Time) {
351 // Unblock the original goroutine that was responsible for the mutation
352 // that triggered the compaction. But hold onto the global non-buffer
353 // state mutex until the operation finishes.
360 // maybeRotateStreams needs mtx AND bufMtx locked.
361 func (s *summary) maybeRotateStreams() {
362 for !s.hotBufExpTime.Equal(s.headStreamExpTime) {
365 if s.headStreamIdx >= len(s.streams) {
368 s.headStream = s.streams[s.headStreamIdx]
369 s.headStreamExpTime = s.headStreamExpTime.Add(s.streamDuration)
373 // flushColdBuf needs mtx locked.
374 func (s *summary) flushColdBuf() {
375 for _, v := range s.coldBuf {
376 for _, stream := range s.streams {
382 s.coldBuf = s.coldBuf[0:0]
383 s.maybeRotateStreams()
386 // swapBufs needs mtx AND bufMtx locked, coldBuf must be empty.
387 func (s *summary) swapBufs(now time.Time) {
388 if len(s.coldBuf) != 0 {
389 panic("coldBuf is not empty")
391 s.hotBuf, s.coldBuf = s.coldBuf, s.hotBuf
392 // hotBuf is now empty and gets new expiration set.
393 for now.After(s.hotBufExpTime) {
394 s.hotBufExpTime = s.hotBufExpTime.Add(s.streamDuration)
398 type summaryCounts struct {
399 // sumBits contains the bits of the float64 representing the sum of all
400 // observations. sumBits and count have to go first in the struct to
401 // guarantee alignment for atomic operations.
402 // http://golang.org/pkg/sync/atomic/#pkg-note-BUG
407 type noObjectivesSummary struct {
408 // countAndHotIdx is a complicated one. For lock-free yet atomic
409 // observations, we need to save the total count of observations again,
410 // combined with the index of the currently-hot counts struct, so that
411 // we can perform the operation on both values atomically. The least
412 // significant bit defines the hot counts struct. The remaining 63 bits
413 // represent the total count of observations. This happens under the
414 // assumption that the 63bit count will never overflow. Rationale: An
415 // observation takes about 30ns. Let's assume it could happen in
416 // 10ns. Overflowing the counter will then take at least (2^63)*10ns,
417 // which is about 3000 years.
419 // This has to be first in the struct for 64bit alignment. See
420 // http://golang.org/pkg/sync/atomic/#pkg-note-BUG
421 countAndHotIdx uint64
425 writeMtx sync.Mutex // Only used in the Write method.
427 // Two counts, one is "hot" for lock-free observations, the other is
428 // "cold" for writing out a dto.Metric. It has to be an array of
429 // pointers to guarantee 64bit alignment of the summaryCounts, see
430 // http://golang.org/pkg/sync/atomic/#pkg-note-BUG.
431 counts [2]*summaryCounts
432 hotIdx int // Index of currently-hot counts. Only used within Write.
434 labelPairs []*dto.LabelPair
437 func (s *noObjectivesSummary) Desc() *Desc {
441 func (s *noObjectivesSummary) Observe(v float64) {
442 // We increment s.countAndHotIdx by 2 so that the counter in the upper
443 // 63 bits gets incremented by 1. At the same time, we get the new value
444 // back, which we can use to find the currently-hot counts.
445 n := atomic.AddUint64(&s.countAndHotIdx, 2)
446 hotCounts := s.counts[n%2]
449 oldBits := atomic.LoadUint64(&hotCounts.sumBits)
450 newBits := math.Float64bits(math.Float64frombits(oldBits) + v)
451 if atomic.CompareAndSwapUint64(&hotCounts.sumBits, oldBits, newBits) {
455 // Increment count last as we take it as a signal that the observation
457 atomic.AddUint64(&hotCounts.count, 1)
460 func (s *noObjectivesSummary) Write(out *dto.Metric) error {
463 hotCounts, coldCounts *summaryCounts
467 // For simplicity, we mutex the rest of this method. It is not in the
468 // hot path, i.e. Observe is called much more often than Write. The
469 // complication of making Write lock-free isn't worth it.
471 defer s.writeMtx.Unlock()
473 // This is a bit arcane, which is why the following spells out this if
474 // clause in English:
476 // If the currently-hot counts struct is #0, we atomically increment
477 // s.countAndHotIdx by 1 so that from now on Observe will use the counts
478 // struct #1. Furthermore, the atomic increment gives us the new value,
479 // which, in its most significant 63 bits, tells us the count of
480 // observations done so far up to and including currently ongoing
481 // observations still using the counts struct just changed from hot to
482 // cold. To have a normal uint64 for the count, we bitshift by 1 and
483 // save the result in count. We also set s.hotIdx to 1 for the next
484 // Write call, and we will refer to counts #1 as hotCounts and to counts
487 // If the currently-hot counts struct is #1, we do the corresponding
488 // things the other way round. We have to _decrement_ s.countAndHotIdx
489 // (which is a bit arcane in itself, as we have to express -1 with an
492 count = atomic.AddUint64(&s.countAndHotIdx, 1) >> 1
494 hotCounts = s.counts[1]
495 coldCounts = s.counts[0]
497 count = atomic.AddUint64(&s.countAndHotIdx, ^uint64(0)) >> 1 // Decrement.
499 hotCounts = s.counts[0]
500 coldCounts = s.counts[1]
503 // Now we have to wait for the now-declared-cold counts to actually cool
504 // down, i.e. wait for all observations still using it to finish. That's
505 // the case once the count in the cold counts struct is the same as the
506 // one atomically retrieved from the upper 63bits of s.countAndHotIdx.
508 if count == atomic.LoadUint64(&coldCounts.count) {
511 runtime.Gosched() // Let observations get work done.
514 sum.SampleCount = proto.Uint64(count)
515 sum.SampleSum = proto.Float64(math.Float64frombits(atomic.LoadUint64(&coldCounts.sumBits)))
518 out.Label = s.labelPairs
520 // Finally add all the cold counts to the new hot counts and reset the cold counts.
521 atomic.AddUint64(&hotCounts.count, count)
522 atomic.StoreUint64(&coldCounts.count, 0)
524 oldBits := atomic.LoadUint64(&hotCounts.sumBits)
525 newBits := math.Float64bits(math.Float64frombits(oldBits) + sum.GetSampleSum())
526 if atomic.CompareAndSwapUint64(&hotCounts.sumBits, oldBits, newBits) {
527 atomic.StoreUint64(&coldCounts.sumBits, 0)
// quantSort adapts a slice of *dto.Quantile to sort.Interface through its Len,
// Swap and Less methods. Less compares the values returned by GetQuantile, so
// passing a quantSort to sort.Sort orders the entries by ascending quantile
// rank.
534 type quantSort []*dto.Quantile
536 func (s quantSort) Len() int {
540 func (s quantSort) Swap(i, j int) {
541 s[i], s[j] = s[j], s[i]
544 func (s quantSort) Less(i, j int) bool {
545 return s[i].GetQuantile() < s[j].GetQuantile()
548 // SummaryVec is a Collector that bundles a set of Summaries that all share the
549 // same Desc, but have different values for their variable labels. This is used
550 // if you want to count the same thing partitioned by various dimensions
551 // (e.g. HTTP request latencies, partitioned by status code and method). Create
552 // instances with NewSummaryVec.
553 type SummaryVec struct {
557 // NewSummaryVec creates a new SummaryVec based on the provided SummaryOpts and
558 // partitioned by the given label names.
560 // Due to the way a Summary is represented in the Prometheus text format and how
561 // it is handled by the Prometheus server internally, “quantile” is an illegal
562 // label name. NewSummaryVec will panic if this label name is used.
563 func NewSummaryVec(opts SummaryOpts, labelNames []string) *SummaryVec {
564 for _, ln := range labelNames {
565 if ln == quantileLabel {
566 panic(errQuantileLabelNotAllowed)
570 BuildFQName(opts.Namespace, opts.Subsystem, opts.Name),
576 metricVec: newMetricVec(desc, func(lvs ...string) Metric {
577 return newSummary(desc, opts, lvs...)
582 // GetMetricWithLabelValues returns the Summary for the given slice of label
583 // values (same order as the VariableLabels in Desc). If that combination of
584 // label values is accessed for the first time, a new Summary is created.
586 // It is possible to call this method without using the returned Summary to only
587 // create the new Summary but leave it at its starting value, a Summary without
590 // Keeping the Summary for later use is possible (and should be considered if
591 // performance is critical), but keep in mind that Reset, DeleteLabelValues and
592 // Delete can be used to delete the Summary from the SummaryVec. In that case,
593 // the Summary will still exist, but it will not be exported anymore, even if a
594 // Summary with the same label values is created later. See also the CounterVec
597 // An error is returned if the number of label values is not the same as the
598 // number of VariableLabels in Desc (minus any curried labels).
600 // Note that for more than one label value, this method is prone to mistakes
601 // caused by an incorrect order of arguments. Consider GetMetricWith(Labels) as
602 // an alternative to avoid that type of mistake. For higher label numbers, the
603 // latter has a much more readable (albeit more verbose) syntax, but it comes
604 // with a performance overhead (for creating and processing the Labels map).
605 // See also the GaugeVec example.
606 func (v *SummaryVec) GetMetricWithLabelValues(lvs ...string) (Observer, error) {
607 metric, err := v.metricVec.getMetricWithLabelValues(lvs...)
609 return metric.(Observer), err
614 // GetMetricWith returns the Summary for the given Labels map (the label names
615 // must match those of the VariableLabels in Desc). If that label map is
616 // accessed for the first time, a new Summary is created. Implications of
617 // creating a Summary without using it and keeping the Summary for later use are
618 // the same as for GetMetricWithLabelValues.
620 // An error is returned if the number and names of the Labels are inconsistent
621 // with those of the VariableLabels in Desc (minus any curried labels).
623 // This method is used for the same purpose as
624 // GetMetricWithLabelValues(...string). See there for pros and cons of the two
626 func (v *SummaryVec) GetMetricWith(labels Labels) (Observer, error) {
627 metric, err := v.metricVec.getMetricWith(labels)
629 return metric.(Observer), err
634 // WithLabelValues works as GetMetricWithLabelValues, but panics where
635 // GetMetricWithLabelValues would have returned an error. Not returning an
636 // error allows shortcuts like
637 // myVec.WithLabelValues("404", "GET").Observe(42.21)
638 func (v *SummaryVec) WithLabelValues(lvs ...string) Observer {
639 s, err := v.GetMetricWithLabelValues(lvs...)
646 // With works as GetMetricWith, but panics where GetMetricWith would have
647 // returned an error. Not returning an error allows shortcuts like
648 // myVec.With(prometheus.Labels{"code": "404", "method": "GET"}).Observe(42.21)
649 func (v *SummaryVec) With(labels Labels) Observer {
650 s, err := v.GetMetricWith(labels)
657 // CurryWith returns a vector curried with the provided labels, i.e. the
658 // returned vector has those labels pre-set for all labeled operations performed
659 // on it. The cardinality of the curried vector is reduced accordingly. The
660 // order of the remaining labels stays the same (just with the curried labels
661 // taken out of the sequence – which is relevant for the
662 // (GetMetric)WithLabelValues methods). It is possible to curry a curried
663 // vector, but only with labels not yet used for currying before.
665 // The metrics contained in the SummaryVec are shared between the curried and
666 // uncurried vectors. They are just accessed differently. Curried and uncurried
667 // vectors behave identically in terms of collection. Only one must be
668 // registered with a given registry (usually the uncurried version). The Reset
669 // method deletes all metrics, even if called on a curried vector.
670 func (v *SummaryVec) CurryWith(labels Labels) (ObserverVec, error) {
671 vec, err := v.curryWith(labels)
673 return &SummaryVec{vec}, err
678 // MustCurryWith works as CurryWith but panics where CurryWith would have
679 // returned an error.
680 func (v *SummaryVec) MustCurryWith(labels Labels) ObserverVec {
681 vec, err := v.CurryWith(labels)
688 type constSummary struct {
692 quantiles map[float64]float64
693 labelPairs []*dto.LabelPair
696 func (s *constSummary) Desc() *Desc {
700 func (s *constSummary) Write(out *dto.Metric) error {
701 sum := &dto.Summary{}
702 qs := make([]*dto.Quantile, 0, len(s.quantiles))
704 sum.SampleCount = proto.Uint64(s.count)
705 sum.SampleSum = proto.Float64(s.sum)
707 for rank, q := range s.quantiles {
708 qs = append(qs, &dto.Quantile{
709 Quantile: proto.Float64(rank),
710 Value: proto.Float64(q),
715 sort.Sort(quantSort(qs))
720 out.Label = s.labelPairs
725 // NewConstSummary returns a metric representing a Prometheus summary with fixed
726 // values for the count, sum, and quantiles. As those parameters cannot be
727 // changed, the returned value does not implement the Summary interface (but
728 // only the Metric interface). Users of this package will not have much use for
729 // it in regular operations. However, when implementing custom Collectors, it is
730 // useful as a throw-away metric that is generated on the fly to send it to
731 // Prometheus in the Collect method.
733 // quantiles maps ranks to quantile values. For example, a median latency of
734 // 0.23s and a 99th percentile latency of 0.56s would be expressed as:
735 // map[float64]float64{0.5: 0.23, 0.99: 0.56}
737 // NewConstSummary returns an error if the length of labelValues is not
738 // consistent with the variable labels in Desc or if Desc is invalid.
739 func NewConstSummary(
743 quantiles map[float64]float64,
744 labelValues ...string,
749 if err := validateLabelValues(labelValues, len(desc.variableLabels)); err != nil {
752 return &constSummary{
756 quantiles: quantiles,
757 labelPairs: makeLabelPairs(desc, labelValues),
761 // MustNewConstSummary is a version of NewConstSummary that panics where
762 // NewConstSummary would have returned an error.
763 func MustNewConstSummary(
767 quantiles map[float64]float64,
768 labelValues ...string,
770 m, err := NewConstSummary(desc, count, sum, quantiles, labelValues...)