freeleaps-ops/3rd/load-watcher/pkg/watcher/internal/metricsprovider/prometheus.go

320 lines
10 KiB
Go
Raw Normal View History

/*
Copyright 2020
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package metricsprovider
import (
"context"
"crypto/tls"
"crypto/x509"
"fmt"
"io/ioutil"
"net"
"net/http"
"net/url"
"os"
"time"
"k8s.io/client-go/transport"
"github.com/paypal/load-watcher/pkg/watcher"
"github.com/prometheus/client_golang/api"
v1 "github.com/prometheus/client_golang/api/prometheus/v1"
"github.com/prometheus/common/config"
"github.com/prometheus/common/model"
log "github.com/sirupsen/logrus"
_ "k8s.io/client-go/plugin/pkg/client/auth/oidc"
)
// Constants used by the Prometheus metrics provider: configuration names,
// PromQL aggregation functions, metric names, and label/sentinel values.
const (
// EnableOpenShiftAuth is the environment variable NewPromClient looks up;
// only its presence is checked, not its value.
EnableOpenShiftAuth = "ENABLE_OPENSHIFT_AUTH"
// K8sPodCAFilePath is the CA bundle NewPromClient loads when the
// EnableOpenShiftAuth environment variable is present.
K8sPodCAFilePath = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
// DefaultPromAddress is the Prometheus address used when the provider
// options do not supply one.
DefaultPromAddress = "http://prometheus-k8s:9090"
// PromQL functions applied to every metric over the query window.
// promResults2MetricMap reports them as watcher.Std and watcher.Average.
promStd = "stddev_over_time"
promAvg = "avg_over_time"
// Metric names queried per host. promResults2MetricMap maps each name to a
// watcher metric type: CPU, Memory, Bandwidth (transmit/receive series),
// Storage (disk IO) and Energy (scaph_* and kepler_* series).
promCpuMetric = "instance:node_cpu:ratio"
promMemMetric = "instance:node_memory_utilisation:ratio"
promTransBandMetric = "instance:node_network_transmit_bytes:rate:sum"
promTransBandDropMetric = "instance:node_network_transmit_drop_excluding_lo:rate5m"
promRecBandMetric = "instance:node_network_receive_bytes:rate:sum"
promRecBandDropMetric = "instance:node_network_receive_drop_excluding_lo:rate5m"
promDiskIOMetric = "instance_device:node_disk_io_time_seconds:rate5m"
promScaphHostPower = "scaph_host_power_microwatts"
promScaphHostJoules = "scaph_host_energy_microjoules"
promKeplerHostCoreJoules = "kepler_node_core_joules_total"
promKeplerHostUncoreJoules = "kepler_node_uncore_joules_total"
promKeplerHostDRAMJoules = "kepler_node_dram_joules_total"
promKeplerHostPackageJoules = "kepler_node_package_joules_total"
promKeplerHostOtherJoules = "kepler_node_other_joules_total"
promKeplerHostGPUJoules = "kepler_node_gpu_joules_total"
promKeplerHostPlatformJoules = "kepler_node_platform_joules_total"
promKeplerHostEnergyStat = "kepler_node_energy_stat"
// allHosts is the sentinel host value that makes buildPromQuery omit the
// per-host label filter.
allHosts = "all"
// hostMetricKey is the label used both to filter a query by host and to
// read the host name back out of each result sample.
hostMetricKey = "node"
)
// promClient is the Prometheus-backed metrics provider returned by NewPromClient.
type promClient struct {
// client is the Prometheus API client used for queries and for the health probe.
client api.Client
// promAddress is the Prometheus address the client was created with; Health
// sends its HEAD request to it.
promAddress string
}
// loadCAFile reads the file at filepath and returns a fresh certificate pool
// holding the PEM-encoded certificates found in it.
//
// The read error is returned unchanged when the file cannot be read. An error
// is also returned when no certificate could be parsed from the file contents.
func loadCAFile(filepath string) (*x509.CertPool, error) {
	pemBytes, readErr := ioutil.ReadFile(filepath)
	if readErr != nil {
		return nil, readErr
	}

	pool := x509.NewCertPool()
	if !pool.AppendCertsFromPEM(pemBytes) {
		return nil, fmt.Errorf("failed to append CA certificate to the pool")
	}
	return pool, nil
}
// newPromTransport returns an http.Transport that uses tlsConfig for TLS and
// the proxy, dial and handshake settings shared by every custom transport
// built in NewPromClient.
func newPromTransport(tlsConfig *tls.Config) *http.Transport {
	return &http.Transport{
		Proxy: http.ProxyFromEnvironment,
		DialContext: (&net.Dialer{
			Timeout:   30 * time.Second,
			KeepAlive: 30 * time.Second,
		}).DialContext,
		TLSHandshakeTimeout: 10 * time.Second,
		TLSClientConfig:     tlsConfig,
	}
}

// NewPromClient builds a Prometheus-backed watcher.MetricsProviderClient from opts.
//
// opts.Name must equal watcher.PromClientName. opts.Address overrides
// DefaultPromAddress when non-empty. A non-empty opts.AuthToken is attached to
// requests as a Bearer credential.
//
// Transport selection:
//   - When the EnableOpenShiftAuth environment variable is present (its value is
//     not inspected), the CA bundle at K8sPodCAFilePath becomes the TLS trust
//     root and the server name is taken from the Prometheus address.
//   - Otherwise, when opts.InsecureSkipVerify is set, TLS verification is disabled.
//   - Otherwise api.DefaultRoundTripper is used.
//
// The selected transport is always handed to the Prometheus client, whether or
// not an auth token is configured.
//
// An error is returned when the provider name does not match, the CA bundle
// cannot be loaded, the Prometheus address cannot be parsed (OpenShift mode
// only), or the Prometheus API client cannot be created.
func NewPromClient(opts watcher.MetricsProviderOpts) (watcher.MetricsProviderClient, error) {
	if opts.Name != watcher.PromClientName {
		return nil, fmt.Errorf("metric provider name should be %v, found %v", watcher.PromClientName, opts.Name)
	}

	promAddress := DefaultPromAddress
	if opts.Address != "" {
		promAddress = opts.Address
	}

	roundTripper := api.DefaultRoundTripper
	if _, enableOpenShiftAuth := os.LookupEnv(EnableOpenShiftAuth); enableOpenShiftAuth {
		// Trust the CA bundle mounted into the pod.
		caCertPool, err := loadCAFile(K8sPodCAFilePath)
		if err != nil {
			return nil, fmt.Errorf("Error loading CA file: %w", err)
		}
		// Parse the address actually used by the client (including the default)
		// and surface parse failures instead of dereferencing a nil URL.
		u, err := url.Parse(promAddress)
		if err != nil {
			return nil, fmt.Errorf("parsing prometheus address %q: %w", promAddress, err)
		}
		roundTripper = newPromTransport(&tls.Config{
			RootCAs: caCertPool,
			// Hostname() strips the port; a "host:port" server name would
			// never match a certificate's host names.
			ServerName: u.Hostname(),
		})
		// Only add the bearer wrapper when there is a token to send.
		if opts.AuthToken != "" {
			roundTripper = transport.NewBearerAuthRoundTripper(opts.AuthToken, roundTripper)
		}
	} else if opts.InsecureSkipVerify {
		// Ignore TLS verification errors when explicitly requested.
		roundTripper = newPromTransport(&tls.Config{InsecureSkipVerify: true})
	}

	if opts.AuthToken != "" {
		roundTripper = config.NewAuthorizationCredentialsRoundTripper("Bearer", config.NewInlineSecret(opts.AuthToken), roundTripper)
	}

	// Always pass the selected round tripper so the TLS settings above apply
	// even when no auth token is configured.
	client, err := api.NewClient(api.Config{
		Address:      promAddress,
		RoundTripper: roundTripper,
	})
	if err != nil {
		log.Errorf("error creating prometheus client: %v", err)
		return nil, err
	}

	return promClient{
		client:      client,
		promAddress: promAddress,
	}, nil
}
// Name returns watcher.PromClientName, the provider name NewPromClient
// requires in its options.
func (s promClient) Name() string {
return watcher.PromClientName
}
// FetchHostMetrics queries Prometheus for every tracked metric of a single
// host, once per aggregation function (avg_over_time and stddev_over_time),
// using window.Duration as the range.
//
// A failed query is logged and skipped. The metrics from the remaining queries
// are still returned, together with the error of the last query that failed
// (nil when all succeeded).
func (s promClient) FetchHostMetrics(host string, window *watcher.Window) ([]watcher.Metric, error) {
	aggregations := [...]string{promAvg, promStd}
	series := [...]string{
		promCpuMetric, promMemMetric,
		promTransBandMetric, promTransBandDropMetric,
		promRecBandMetric, promRecBandDropMetric,
		promDiskIOMetric,
		promScaphHostPower, promScaphHostJoules,
		promKeplerHostCoreJoules, promKeplerHostUncoreJoules, promKeplerHostDRAMJoules,
		promKeplerHostPackageJoules, promKeplerHostOtherJoules, promKeplerHostGPUJoules,
		promKeplerHostPlatformJoules, promKeplerHostEnergyStat,
	}

	var (
		collected []watcher.Metric
		lastErr   error
	)
	for _, aggregation := range aggregations {
		for _, name := range series {
			query := s.buildPromQuery(host, name, aggregation, window.Duration)
			results, queryErr := s.getPromResults(query)
			if queryErr != nil {
				log.Errorf("error querying Prometheus for query %v: %v\n", query, queryErr)
				lastErr = queryErr
				continue
			}
			// Keep only the samples reported for the requested host.
			byHost := s.promResults2MetricMap(results, name, aggregation, window.Duration)
			collected = append(collected, byHost[host]...)
		}
	}
	return collected, lastErr
}
// FetchAllHostsMetrics queries Prometheus for every tracked metric across all
// hosts, once per aggregation function (avg_over_time and stddev_over_time),
// using window.Duration as the range. Results are grouped by host name.
//
// A failed query is logged and skipped. The metrics from the remaining queries
// are still returned, together with the error of the last query that failed
// (nil when all succeeded).
func (s promClient) FetchAllHostsMetrics(window *watcher.Window) (map[string][]watcher.Metric, error) {
	aggregations := [...]string{promAvg, promStd}
	series := [...]string{
		promCpuMetric, promMemMetric,
		promTransBandMetric, promTransBandDropMetric,
		promRecBandMetric, promRecBandDropMetric,
		promDiskIOMetric,
		promScaphHostPower, promScaphHostJoules,
		promKeplerHostCoreJoules, promKeplerHostUncoreJoules, promKeplerHostDRAMJoules,
		promKeplerHostPackageJoules, promKeplerHostOtherJoules, promKeplerHostGPUJoules,
		promKeplerHostPlatformJoules, promKeplerHostEnergyStat,
	}

	var lastErr error
	hostMetrics := make(map[string][]watcher.Metric)
	for _, aggregation := range aggregations {
		for _, name := range series {
			query := s.buildPromQuery(allHosts, name, aggregation, window.Duration)
			results, queryErr := s.getPromResults(query)
			if queryErr != nil {
				log.Errorf("error querying Prometheus for query %v: %v\n", query, queryErr)
				lastErr = queryErr
				continue
			}
			byHost := s.promResults2MetricMap(results, name, aggregation, window.Duration)
			for hostName, samples := range byHost {
				// Samples without a host label are keyed by "" and dropped.
				if hostName != "" {
					hostMetrics[hostName] = append(hostMetrics[hostName], samples...)
				}
			}
		}
	}
	return hostMetrics, lastErr
}
// Health probes the configured Prometheus address with a HEAD request.
//
// It returns (0, nil) when the server answers with HTTP 200. It returns -1
// and an error when the request cannot be built, the request fails, or any
// other status code is received.
//
// The probe is bounded by a 10 second timeout, the same limit getPromResults
// applies to queries, so an unresponsive server cannot block the caller
// indefinitely.
func (s promClient) Health() (int, error) {
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	req, err := http.NewRequestWithContext(ctx, http.MethodHead, s.promAddress, nil)
	if err != nil {
		return -1, err
	}
	resp, _, err := s.client.Do(ctx, req)
	if err != nil {
		return -1, err
	}
	if resp.StatusCode != http.StatusOK {
		return -1, fmt.Errorf("received response status code: %v", resp.StatusCode)
	}
	return 0, nil
}
// buildPromQuery renders the PromQL expression method(metric[rollup]).
//
// When host equals allHosts the series is queried without a label filter.
// Otherwise the query is restricted to series whose hostMetricKey label equals
// host.
//
// The host value is written as a quoted, escaped string literal so that quotes
// or backslashes in it cannot terminate the literal early or change the
// selector. For ordinary host names the output is identical to plain
// interpolation between double quotes.
func (s promClient) buildPromQuery(host string, metric string, method string, rollup string) string {
	if host == allHosts {
		return fmt.Sprintf("%s(%s[%s])", method, metric, rollup)
	}
	return fmt.Sprintf("%s(%s{%s=%q}[%s])", method, metric, hostMetricKey, host, rollup)
}
// getPromResults runs promQuery as an instant query evaluated at the current
// time, bounded by a 10 second timeout.
//
// Warnings returned by the API are logged at warn level and the result is
// logged at debug level. On a query error the result is nil and the error is
// returned unchanged.
func (s promClient) getPromResults(promQuery string) (model.Value, error) {
	queryAPI := v1.NewAPI(s.client)

	queryCtx, stop := context.WithTimeout(context.Background(), 10*time.Second)
	defer stop()

	value, warns, queryErr := queryAPI.Query(queryCtx, promQuery, time.Now())
	if queryErr != nil {
		return nil, queryErr
	}

	if len(warns) != 0 {
		log.Warnf("Warnings: %v\n", warns)
	}
	log.Debugf("result:\n%v\n", value)

	return value, nil
}
// promResults2MetricMap converts a Prometheus query result into watcher
// metrics grouped by host.
//
// metric selects the watcher metric type (CPU, Memory, Storage, Energy,
// Bandwidth, or Unknown for unrecognised names). method selects the operator
// (Average, Std, or UnknownOperator). rollup is copied into every metric.
//
// Only vector results are converted. Any other result type is logged as an
// error and yields an empty map. Each sample value is multiplied by 100, and
// the sample is filed under the value of its hostMetricKey label (the empty
// string when the label is absent).
func (s promClient) promResults2MetricMap(promresults model.Value, metric string, method string, rollup string) map[string][]watcher.Metric {
	// Classify the metric name.
	var metricType string
	switch metric {
	case promCpuMetric:
		metricType = watcher.CPU
	case promMemMetric:
		metricType = watcher.Memory
	case promDiskIOMetric:
		metricType = watcher.Storage
	case promTransBandMetric, promTransBandDropMetric,
		promRecBandMetric, promRecBandDropMetric:
		metricType = watcher.Bandwidth
	case promScaphHostPower, promScaphHostJoules,
		promKeplerHostCoreJoules, promKeplerHostUncoreJoules,
		promKeplerHostDRAMJoules, promKeplerHostPackageJoules,
		promKeplerHostOtherJoules, promKeplerHostGPUJoules,
		promKeplerHostPlatformJoules, promKeplerHostEnergyStat:
		metricType = watcher.Energy
	default:
		metricType = watcher.Unknown
	}

	// Translate the PromQL function into the watcher operator.
	var operator string
	switch method {
	case promAvg:
		operator = watcher.Average
	case promStd:
		operator = watcher.Std
	default:
		operator = watcher.UnknownOperator
	}

	curMetrics := make(map[string][]watcher.Metric)

	samples, isVector := promresults.(model.Vector)
	if !isVector {
		log.Errorf("error: The Prometheus results should not be type: %v.\n", promresults.Type())
		return curMetrics
	}

	for _, sample := range samples {
		hostName := string(sample.Metric[hostMetricKey])
		curMetrics[hostName] = append(curMetrics[hostName], watcher.Metric{
			Name:     metric,
			Type:     metricType,
			Operator: operator,
			Rollup:   rollup,
			Value:    float64(sample.Value * 100),
		})
	}
	return curMetrics
}