Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.

Current state: Under Discussion

ISSUE: #5955 #6249 #7091 #7114 #7139

PRs: #5987 #7113 #7137

Keywords: metric, Proxy

Released:

...

// Excerpt of MilvusService showing only the metrics entry point.
service MilvusService {
// GetMetrics takes a GetMetricsRequest and returns a GetMetricsResponse.
// The request carries a JSON-formatted string that selects what to report
// (for example {"metric_type": "system_info"}).
rpc GetMetrics(GetMetricsRequest) returns (GetMetricsResponse) {}
}

// GetMetricsRequest is the argument of the GetMetrics RPC.
message GetMetricsRequest {
// request is a JSON-formatted string. Using a JSON string instead of typed
// fields lets the request be extended later without changing this message.
string request = 1;
}

// GetMetricsResponse is the reply of the GetMetrics RPC. The name must match
// the return type referenced in the rpc declaration (GetMetricsResponse).
message GetMetricsResponse {
// status of the call; common.Status is defined outside this excerpt.
common.Status status = 1;
// response is a JSON-formatted string. Using a JSON string instead of typed
// fields lets the response be extended later without changing this message.
string response = 2;
}

I will describe how this interface should be used later in Design Details.

...

request:

{
   "metric_type": "system_info"
}

response:


Code Block
languagejs
firstline1
titleresponse
linenumberstrue
collapsetrue
{
  "nodes_info": [
    {
      "identifier": 1, // unique in the list of nodes_info
      "name": "root coordinator",
      "hardware_info": {
        "ip": "192.168.1.1",
        "cpu_core_count": 2,
        "cpu_core_usage": "10%",
        "memory": "13124124", 
        "memory_usage": "234123",
        "disk": "234123", 
        "disk_usage": "123123"
      },
      "system_info": {
        "system_version": "rc2 a3c662c7b",
        "deploy_mode": "cluster"
      },
      "system_configurations": {
        "maxPartitionNum": 4096,
        "timeTickInterval": 200
      },
      "created_time": "2021-04-13 08:41:34.51+00",
      "updated_time": "2021-04-13 08:41:34.51+00",
      "type": "coordinator",
      "connected": []
    },
    {
      "identifier": 2,
      "name": "data coordinator",
      "hardware_info": {
        "ip": "192.168.1.1",
        "cpu_core_count": 2,
        "cpu_core_usage": "10%",
        "memory": "13124124", 
        "memory_usage": "234123", 
        "disk": "234123", 
        "disk_usage": "123123"
      },
      "system_info": {
        "system_version": "rc2 a3c662c7b",
        "deploy_mode": "cluster"
      },
      "system_configurations": {
        "maxPartitionNum": 4096,
        "timeTickInterval": 200
      },
      "created_time": "2021-04-13 08:41:34.51+00",
      "updated_time": "2021-04-13 08:41:34.51+00",
      "type": "coordinator",
      "connected": [
        {
          "parent": 1,
          "method": "manage"
        }
      ]
    },
    {
      "identifier": 3,
      "name": "proxy",
      "hardware_info": {
        "ip": "192.168.1.1",
        "cpu_core_count": 2,
        "cpu_core_usage": "10%",
        "memory": "13124124", 
        "memory_usage": "234123", 
        "disk": "234123", 
        "disk_usage": "123123"
      },
      "system_info": {
        "system_version": "rc2 a3c662c7b",
        "deploy_mode": "cluster"
      },
      "system_configurations": {
        "maxPartitionNum": 4096,
        "timeTickInterval": 200
      },
      "created_time": "2021-04-13 08:41:34.51+00",
      "updated_time": "2021-04-13 08:41:34.51+00",
      "type": "proxy",
      "connected": [
        {
          "parent": 1,
          "method": "notification"
        },
        {
          "parent": 2,
          "method": "notification"
        }
      ]
    },
    {
      "identifier": 4,
      "name": "data node 1",
      "hardware_info": {
        "ip": "192.168.1.1",
        "cpu_core_count": 2,
        "cpu_core_usage": "10%",
        "memory": "13124124", 
        "memory_usage": "234123", 
        "disk": "234123", 
        "disk_usage": "123123"
      },
      "system_info": {
        "system_version": "rc2 a3c662c7b",
        "deploy_mode": "cluster"
      },
      "system_configurations": {
        "maxPartitionNum": 4096,
        "timeTickInterval": 200
      },
      "created_time": "2021-04-13 08:41:34.51+00",
      "updated_time": "2021-04-13 08:41:34.51+00",
      "type": "data node",
      "connected": [
        {
          "parent": 2,
          "method": "manage"
        }
      ]
    },
    {
      "identifier": 5,
      "name": "data node 2",
      "hardware_info": {
        "ip": "192.168.1.1",
        "cpu_core_count": 2,
        "cpu_core_usage": "10%",
        "memory": "13124124", 
        "memory_usage": "234123", 
        "disk": "234123", 
        "disk_usage": "123123"
      },
      "system_info": {
        "system_version": "rc2 a3c662c7b",
        "deploy_mode": "cluster"
      },
      "system_configurations": {
        "maxPartitionNum": 4096,
        "timeTickInterval": 200
      },
      "created_time": "2021-04-13 08:41:34.51+00",
      "updated_time": "2021-04-13 08:41:34.51+00",
      "type": "data node",
      "connected": [
        {
          "parent": 2,
          "method": "manage"
        }
      ]
    }
  ]
}

...

System Statistics

{
   "metric_type": "system_statistics"
}

response:

{
  "hardware_statistics": [
    {
      "identifier": 1,    // unique in the list of hardware_statistics
      "name": "root coordinator",
      "hardware_usage": {
        "cpu": {
          "type": "Intel(R) Core(TM) i7-8700 CPU @ 3.20GHz",
          "usage": 6
        },
        "memory": {
          "total": 320000,    // in megabytes
          "usage": 120000     // in megabytes
        }
      }
    }
    // ...
  ],
  "loaded_collections": [
    {
      "name": "coll1",
      "loaded_time": "2021/07/05 11:13:44.372 +08:00",
      "loaded_partitions": [
        {
          "name": "partition1",
          "loaded_time": "2021/07/05 11:13:44.372 +08:00"
        },
        // ...
      ]
    },
    // ...
  ],
  "collection_count": 3,
  "partitions_count": [
    {
      "coll1": 2
    },
    // ...
  ],
  "indexes_count": [
    {
      "coll1": 2
    },
    // ...
  ],
  "qps": 10096,
  "latency": 0.1
}

System Event Log

{
   "metric_type": "system_log"
}

response:

{
   "dd": [
       "create collection 1 at ts1",
       "create collection 2 at ts2"
  ],
   "dm": [
       "insert 20 records into collection 1",
       "insert 30 records into collection 2"
  ],
   "dq": [
       "search on collection 1, nq: 10, topk = 5",
       "search on collection 2, nq: 10, topk = 5"
  ]
}

Test Plan

test script written with pymilvus:

#!/usr/bin/env python
"""Manual smoke test for the GetMetrics RPC.

Connects to a Milvus server at ``ip``:``port``, sends one GetMetrics request
per metric type and prints each raw response. Requires a reachable server.
"""

import ujson

# Milvus is the client class instantiated below; it must be imported explicitly.
from pymilvus import Milvus
from pymilvus.grpc_gen import milvus_pb2 as milvus_types

ip = "127.0.0.1"
port = "19530"

# Metric types exercised by this script; each is sent as
# {"metric_type": <value>} in the JSON request string.
METRIC_TYPES = ("system_info", "system_statistics", "system_log")

if __name__ == "__main__":
    client = Milvus(host=ip, port=port)
    try:
        # The request is issued through the client's underlying gRPC stub.
        with client._connection() as handler:
            for metric_type in METRIC_TYPES:
                payload = ujson.dumps({"metric_type": metric_type})
                req = milvus_types.GetMetricsRequest(request=payload)
                resp = handler._stub.GetMetrics(req, wait_for_ready=True, timeout=None)
                print(resp)
    finally:
        # Close the client even if one of the requests raised.
        client.close()