pax_global_header00006660000000000000000000000064126526673230014526gustar00rootroot0000000000000052 comment=42187541f14333292b38d1f67611454331a8b7ea carbon-c-relay-1.7/000077500000000000000000000000001265266732300141735ustar00rootroot00000000000000carbon-c-relay-1.7/.gitignore000066400000000000000000000000131265266732300161550ustar00rootroot00000000000000*.o /relay carbon-c-relay-1.7/LICENSE.md000066400000000000000000000236761265266732300156150ustar00rootroot00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. 
"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. 
Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS carbon-c-relay-1.7/Makefile000066400000000000000000000030111265266732300156260ustar00rootroot00000000000000# Copyright 2013-2016 Fabian Groffen # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. CFLAGS ?= -O2 -Wall # if your compiler doesn't support OpenMP, comment out this line, or # define OPENMP_FLAGS to be empty OPENMP_FLAGS ?= -fopenmp override CC += $(OPENMP_FLAGS) GIT_VERSION := $(shell git describe --abbrev=6 --dirty --always || date +%F) GVCFLAGS += -DGIT_VERSION=\"$(GIT_VERSION)\" override CFLAGS += $(GVCFLAGS) -pthread SOCKET_LIBS = ifeq ($(shell uname), SunOS) SOCKET_LIBS += -lsocket -lnsl endif # should be accepted sort of anywhere MATH_LIBS = -lm override LIBS += $(SOCKET_LIBS) $(MATH_LIBS) -pthread OBJS = \ relay.o \ md5.o \ consistent-hash.o \ receptor.o \ dispatcher.o \ router.o \ queue.o \ server.o \ collector.o \ aggregator.o relay: $(OBJS) $(CC) -o $@ $(LDFLAGS) $^ $(LIBS) VERSION = $(shell sed -n '/VERSION/s/^.*"\([0-9.]\+\)".*$$/\1/p' relay.h) dist: git archive \ --format=tar.gz \ --prefix=carbon-c-relay-$(VERSION)/ v$(VERSION) \ > carbon-c-relay-$(VERSION).tar.gz clean: rm -f *.o relay carbon-c-relay-1.7/README.md000066400000000000000000000732631265266732300154650ustar00rootroot00000000000000carbon-c-relay ============== Carbon-like graphite line mode relay. This project aims to be a fast replacement of the original [Carbon relay](http://graphite.readthedocs.org/en/1.0/carbon-daemons.html#carbon-relay-py) The main reason to build a replacement is performance and configurability. Carbon is single threaded, and sending metrics to multiple consistent-hash clusters requires chaining of relays. This project provides a multithreaded relay which can address multiple targets and clusters for each and every metric based on pattern matches. There are a couple more replacement projects out there we know of, which are [carbon-relay-ng](https://github.com/graphite-ng/carbon-relay-ng) and [graphite-relay](https://github.com/markchadwick/graphite-relay ) Compared to carbon-relay-ng, this project does provide carbon's consistent-hash routing. 
graphite-relay, which does this, however doesn't do metric-based matches to direct the traffic, which this project does as well. To date, carbon-c-relay can do aggregations, failover targets and more. The relay is a simple program that reads its routing information from a file. The command line arguments allow to set the location for this file, as well as the amount of dispatchers (worker threads) to use for reading the data from incoming connections and passing them onto the right destination(s). The route file supports two main constructs: clusters and matches. The first define groups of hosts data metrics can be sent to, the latter define which metrics should be sent to which cluster. Aggregation rules are seen as matches. For every metric received by the relay, cleansing is performed. The following changes are performed before any match, aggregate or rewrite rule sees the metric: - double dot elimination (necessary for correctly functioning consistent hash routing) - trailing/leading dot elimination - whitespace normalisation (this mostly affects output of the relay to other targets: metric, value and timestamp will be separated by a single space only, ever) - irregular char replacement with underscores (\_), currently irregular is defined as not being in [0-9a-zA-Z-_:#], but can be overridden on the command line. The route file syntax is as follows: ``` # comments are allowed in any place and start with a hash (#) cluster [replication ]> ]> ... ; cluster file [ip] ... ; match <* | expression ...> send to [stop] ; rewrite into ; aggregate ... every seconds expire after seconds [timestamp at of bucket] compute | variance | stddev> write to [compute ...] [send to ] [stop] ; ``` Multiple clusters can be defined, and need not to be referenced by a match rule. All clusters point to one or more hosts, except the `file` cluster which writes to files in the local filesystem. `host` may be an IPv4 or IPv6 address, or a hostname. 
Since host is followed by an optional `:` and port, for IPv6 addresses not to be interpreted wrongly, either a port must be given, or the IPv6 address surrounded by brackets, e.g. `[::1]`. An optional `proto udp` or `proto tcp` may be added to specify the use of UDP or TCP to connect to the remote server. When omitted this defaults to a TCP connection. The `forward` and `file` clusters simply send everything they receive to all defined members (host addresses or files). The `any_of` cluster is a small variant of the `forward` cluster, but instead of sending to all defined members, it sends each incoming metric to one of defined members. This is not much useful in itself, but since any of the members can receive each metric, this means that when one of the members is unreachable, the other members will receive all of the metrics. This can be useful when the cluster points to other relays. The `any_of` router tries to send the same metrics consistently to the same destination. The `failover` cluster is like the `any_of` cluster, but sticks to the order in which servers are defined. This is to implement a pure failover scenario between servers. The `carbon_ch` cluster sends the metrics to the member that is responsible according to the consistent hash algorithm (as used in the original carbon), or multiple members if replication is set to more than 1. The `fnv1a_ch` cluster is a identical in behaviour to `carbon_ch`, but it uses a different hash technique (FNV1a) which is faster but more importantly defined to get by a limitation of `carbon_ch` to use both host and port from the members. This is useful when multiple targets live on the same host just separated by port. The instance that original carbon uses to get around this can be set by appending it after the port, separated by an equals sign, e.g. `127.0.0.1:2006=a` for instance `a`. When using the `fnv1a_ch` cluster, this instance overrides the hash key in use. 
This allows for many things, including masquerading old IP addresses, but mostly to make the hash key location to become agnostic of the (physical) location of that key. For example, usage like `10.0.0.1:2003=4d79d13554fa1301476c1f9fe968b0ac` would allow to change port and/or ip address of the server that receives data for the instance key. Obviously, this way migration of data can be dealt with much more conveniently. The `jump_fnv1a_ch` cluster is also a consistent hash cluster like the previous two, but it does not take the server information into account at all. Whether this is useful to you depends on your scenario. The jump hash has a much better balancing over the servers defined in the cluster, at the expense of not being able to remove any server but the last in order. What this means is that this hash is fine to use with ever growing clusters where older nodes are also replaced at some point. If you have a cluster where removal of old nodes takes place often, the jump hash is not suitable for you. Jump hash works with servers in an ordered list without gaps. To influence the ordering, the instance given to the server will be used as sorting key. Without, the order will be as given in the file. It is a good practice to fix the order of the servers with instances such that it is explicit what the right nodes for the jump hash are. DNS hostnames are resolved to a single address, according to the preference rules in [RFC 3484](https://www.ietf.org/rfc/rfc3484.txt). The `any_of` cluster has an explicit `useall` flag that enables a hostname to resolve to multiple addresses. Each address returned becomes a cluster destination. Match rules are the way to direct incoming metrics to one or more clusters. Match rules are processed top to bottom as they are defined in the file. It is possible to define multiple matches in the same rule. Each match rule can send data to one or more clusters. 
Since match rules "fall through" unless the `stop` keyword is added, carefully crafted match expression can be used to target multiple clusters or aggregations. This ability allows to replicate metrics, as well as send certain metrics to alternative clusters with careful ordering and usage of the `stop` keyword. The special cluster `blackhole` discards any metrics sent to it. This can be useful for weeding out unwanted metrics in certain cases. Because throwing metrics away is pointless if other matches would accept the same data, a match with as destination the blackhole cluster, has an implicit `stop`. Rewrite rules take a regular input to match incoming metrics, and transform them into the desired new metric name. In the replacement, backreferences are allowed to match capture groups defined in the input regular expression. A match of `server\.(x|y|z)\.` allows to use e.g. `role.\1.` in the substitution. A few caveats apply to the current implementation of rewrite rules. First, their location in the config file determines when the rewrite is performed. The rewrite is done in-place, as such a match rule before the rewrite would match the original name, a match rule after the rewrite no longer matches the original name. Care should be taken with the ordering, as multiple rewrite rules in succession can take place, e.g. `a` gets replaced by `b` and `b` gets replaced by `c` in a succeeding rewrite rule. The second caveat with the current implementation, is that the rewritten metric names are not cleansed, like newly incoming metrics are. Thus, double dots and potential dangerous characters can appear if the replacement string is crafted to produce them. It is the responsibility of the writer to make sure the metrics are clean. If this is an issue for routing, one can consider to have a rewrite-only instance that forwards all metrics to another instance that will do the routing. Obviously the second instance will cleanse the metrics as they come in. 
The backreference notation allows to lowercase and uppercase the replacement string with the use of the underscore (`_`) and carret (`^`) symbols following directly after the backslash. For example, `role.\_1.` as substitution will lowercase the contents of `\1`. The aggregations defined take one or more input metrics expressed by one or more regular expresions, similar to the match rules. Incoming metrics are aggregated over a period of time defined by the interval in seconds. Since events may arrive a bit later in time, the expiration time in seconds defines when the aggregations should be considered final, as no new entries are allowed to be added any more. On top of an aggregation multiple aggregations can be computed. They can be of the same or different aggregation types, but should write to a unique new metric. The metric names can include back references like in rewrite expressions, allowing for powerful single aggregation rules that yield in many aggregations. When no `send to` clause is given, produced metrics are sent to the relay as if they were submitted from the outside, hence match and aggregation rules apply to those. Care should be taken that loops are avoided this way. For this reason, the use of the `send to` clause is encouraged, to direct the output traffic where possible. Like for match rules, it is possible to define multiple cluster targets. Also, like match rules, the `stop` keyword applies to control the flow of metrics in the matching process. Examples -------- Carbon-c-relay evolved over time, growing features on demand as the tool proved to be stable and fitting the job well. Below follow some annotated examples of constructs that can be used with the relay. Clusters can be defined as much as necessary. They receive data from match rules, and their type defines which members of the cluster finally get the metric data. 
The simplest cluster form is a `forward` cluster: cluster send-through forward 10.1.0.1 ; Any metric sent to the `send-through` cluster would simply be forwarded to the server at IPv4 address `10.1.0.1`. If we define multiple servers, all of those servers would get the same metric, thus: cluster send-through forward 10.1.0.1 10.2.0.1 ; The above results in a duplication of metrics send to both machines. This can be useful, but most of the time it is not. The `any_of` cluster type is like `forward`, but it sends each incoming metric to any of the members. The same example with such cluster would be: cluster send-to-any-one any_of 10.1.0.1:2010 10.1.0.1:2011; This would implement a multipath scenario, where two servers are used, the load between them is spread, but should any of them fail, all metrics are sent to the remaining one. This typically works well for upstream relays, or for balancing carbon-cache processes running on the same machine. Should any member become unavailable, for instance due to a rolling restart, the other members receive the traffic. If it is necessary to have true fail-over, where the secondary server is only used if the first is down, the following would implement that: cluster try-first-then-second failover 10.1.0.1:2010 10.1.0.1:2011; These types are different from the two consistent hash cluster types: cluster graphite carbon_ch 127.0.0.1:2006=a 127.0.0.1:2007=b 127.0.0.1:2008=c ; If a member in this example fails, all metrics that would go to that member are kept in the queue, waiting for the member to return. This is useful for clusters of carbon-cache machines where it is desirable that the same metric ends up on the same server always. The `carbon_ch` cluster type is compatible with carbon-relay consistent hash, and can be used for existing clusters populated by carbon-relay. 
For new clusters, however, it is better to use the `fnv1a_ch` cluster type, for it is faster, and allows to balance over the same address but different ports without an instance number, in constrast to `carbon_ch`. Because we can use multiple clusters, we can also replicate without the use of the `forward` cluster type, in a more intelligent way: cluster dc-old carbon_ch replication 2 10.1.0.1 10.1.0.2 10.1.0.3 ; cluster dc-new1 fnv1a_ch replication 2 10.2.0.1 10.2.0.2 10.2.0.3 ; cluster dc-new2 fnv1a_ch replication 2 10.3.0.1 10.3.0.2 10.3.0.3 ; match * send to dc-old ; match * send to dc-new1 dc-new2 stop ; In this example all incoming metrics are first sent to `dc-old`, then `dc-new1` and finally to `dc-new2`. Note that the cluster type of `dc-old` is different. Each incoming metric will be send to 2 members of all three clusters, thus replicating to in total 6 destinations. For each cluster the destination members are computed independently. Failure of clusters or members does not affect the others, since all have individual queues. The above example could also be written using three match rules for each dc, or one match rule for all three dcs. The difference is mainly in performance, the number of times the incoming metric has to be matched against an expression. The `stop` rule in `dc-new` match rule is not strictly necessary in this example, because there are no more following match rules. However, if the match would target a specific subset, e.g. `^sys\.`, and more clusters would be defined, this could be necessary, as for instance in the following abbreviated example: cluster dc1-sys ... ; cluster dc2-sys ... ; cluster dc1-misc ... ; cluster dc2-misc ... ; match ^sys\. send to dc1-sys; match ^sys\. send to dc2-sys stop; match * send to dc1-misc; match * send to dc2-misc stop; As can be seen, without the `stop` in dc2-sys' match rule, all metrics starting with `sys.` would also be send to dc1-misc and dc2-misc. 
It can be that this is desired, of course, but in this example there is a dedicated cluster for the `sys` metrics. Suppose there would be some unwanted metric that unfortunately is generated, let's assume some bad/old software. We don't want to store this metric. The `blackhole` cluster is suitable for that, when it is harder to actually whitelist all wanted metrics. Consider the following: match some_legacy1$ some_legacy2$ send to blackhole stop; This would throw away all metrics that end with `some_legacy`, that would otherwise be hard to filter out. Since the order matters, it can be used in a construct like this: cluster old ... ; cluster new ... ; match * send to old; match unwanted send to blackhole stop; match * send to new; In this example the old cluster would receive the metric that's unwanted for the new cluster. So, the order in which the rules occur does matter for the execution. The relay is capable of rewriting incoming metrics on the fly. This process is done based on regular expressions with capture groups that allow to substitute parts in a replacement string. Rewrite rules allow to cleanup metrics from applications, or provide a migration path. In it's simplest form a rewrite rule looks like this: rewrite ^server\.(.+)\.(.+)\.([a-zA-Z]+)([0-9]+) into server.\_1.\2.\3.\3\4 ; In this example a metric like `server.DC.role.name123` would be transformed into `server.dc.role.name.name123`. For rewrite rules hold the same as for matches, that their order matters. Hence to build on top of the old/new cluster example done earlier, the following would store the original metric name in the old cluster, and the new metric name in the new cluster: match * send to old; rewrite ... ; match * send to new; Note that after the rewrite, the original metric name is no longer available, as the rewrite happens in-place. Aggregations are probably the most complex part of carbon-c-relay. Two ways of specifying aggregates are supported by carbon-c-relay. 
The first, static rules, are handled by an optimiser which tries to fold thousands of rules into groups to make the matching more efficient. The second, dynamic rules, are very powerful compact definitions with possibly thousands of internal instantiations. A typical static aggregation looks like: aggregate ^sys\.dc1\.somehost-[0-9]+\.somecluster\.mysql\.replication_delay ^sys\.dc2\.somehost-[0-9]+\.somecluster\.mysql\.replication_delay every 10 seconds expire after 35 seconds timestamp at end of bucket compute sum write to mysql.somecluster.total_replication_delay compute average write to mysql.somecluster.average_replication_delay compute max write to mysql.somecluster.max_replication_delay compute count write to mysql.somecluster.replication_delay_metric_count ; In this example, four aggregations are produced from the incoming matching metrics. In this example we could have written the two matches as one, but for demonstration purposes we did not. Obviously they can refer to different metrics, if that makes sense. The `every 10 seconds` clause specifies in what interval the aggregator can expect new metrics to arrive. This interval is used to produce the aggregations, thus each 10 seconds 4 new metrics are generated from the data received sofar. Because data may be in transit for some reason, or generation stalled, the `expire after` clause specifies how long the data should be kept before considering a data bucket (which is aggregated) to be complete. In the example, 35 was used, which means after 35 seconds the first aggregates are produced. It also means that metrics can arrive 35 seconds late, and still be taken into account. The exact time at which the aggregate metrics are produced is random between 0 and interval (10 in this case) seconds after the expiry time. This is done to prevent thundering herds of metrics for large aggregation sets. 
The `timestamp` that is used for the aggregations can be specified to be the `start`, `middle` or `end` of the bucket. Original carbon-aggregator.py uses `start`, while carbon-c-relay's default has always been `end`. The `compute` clauses demonstrate a single aggregation rule can produce multiple aggregates, as often is the case. Internally, this comes for free, since all possible aggregates are always calculated, whether or not they are used. The produced new metrics are resubmitted to the relay, hence matches defined before in the configuration can match output of the aggregator. It is important to avoid loops, that can be generated this way. In general, splitting aggregations to their own carbon-c-relay instance, such that it is easy to forward the produced metrics to another relay instance is a good practice. The previous example could also be written as follows to be dynamic: aggregate ^sys\.dc[0-9].(somehost-[0-9]+)\.([^.]+)\.mysql\.replication_delay every 10 seconds expire after 35 seconds compute sum write to mysql.host.\1.replication_delay compute sum write to mysql.host.all.replication_delay compute sum write to mysql.cluster.\2.replication_delay compute sum write to mysql.cluster.all.replication_delay ; Here a single match, results in four aggregations, each of a different scope. In this example aggregation based on hostname and cluster are being made, as well as the more general `all` targets, which in this example have both identical values. Note that with this single aggregation rule, both per-cluster, per-host and total aggregations are produced. Obviously, the input metrics define which hosts and clusters are produced. With use of the `send to` clause, aggregations can be made more intuitive and less error-prone. 
Consider the below example: cluster graphite fnv1a_ch ip1 ip2 ip3; aggregate ^sys\.somemetric every 60 seconds expire after 75 seconds compute sum write to sys.somemetric send to graphite stop ; match * send to graphite; It sends all incoming metrics to the graphite cluster, except the sys.somemetric ones, which it replaces with a sum of all the incoming ones. Without a `stop` in the aggregate, this causes a loop, and without the `send to`, the metric name can't be kept its original name, for the output now directly goes to the cluster. Statistics ---------- When carbon-c-relay is run without `-d` or `-s` arguments, statistics will be produced and sent to the relay itself in the form of `carbon.relays..*`. The hostname is determined on startup, and can be overriden using the `-H` argument. While many metrics have a similar name to what carbon-cache.py would produce, their values are different. To obtain a more compatible set of values, the `-m` argument can be used to make values non-cumulative, that is, they will report the change compared to the previous value. By default, most values are running counters which only increase over time. The use of the nonNegativeDerivative() function from graphite is useful with these. The default sending interval is 1 minute (60 seconds), but can be overridden using the `-S` argument specified in seconds. The following metrics are produced in the `carbon.relays.` namespace: * metricsReceived The number of metrics that were received by the relay. Received here means that they were seen and processed by any of the dispatchers. * metricsSent The number of metrics that were sent from the relay. This is a total count for all servers combined. When incoming metrics are duplicated by the cluster configuration, this counter will include all those duplications. In other words, the amount of metrics that were successfully sent to other systems. 
Note that metrics that are processed (received) but still in the sending queue (queued) are not included in this counter. * metricsQueued The total number of metrics that are currently in the queues for all the server targets. This metric is not cumulative, for it is a sample of the queue size, which can (and should) go up and down. Therefore you should not use the derivative function for this metric. * metricsDropped The total number of metric that had to be dropped due to server queues overflowing. A queue typically overflows when the server it tries to send its metrics to is not reachable, or too slow in ingesting the amount of metrics queued. This can be network or resource related, and also greatly depends on the rate of metrics being sent to the particular server. * metricsBlackholed The number of metrics that did not match any rule, or matched a rule with blackhole as target. Depending on your configuration, a high value might be an indication of a misconfiguration somewhere. These metrics were received by the relay, but never sent anywhere, thus they disappeared. * metricStalls The number of times the relay had to stall a client to indicate that the downstream server cannot handle the stream of metrics. A stall is only performed when the queue is full and the server is actually receptive of metrics, but just too slow at the moment. Stalls typically happen during micro-bursts, where the client typically is unaware that it should stop sending more data, while it is able to. * connections The number of connect requests handled. This is an ever increasing number just counting how many connections were accepted. * disconnects The number of disconnected clients. A disconnect either happens because the client goes away, or due to an idle timeout in the relay. The difference between this metric and connections is the amount of connections actively held by the relay. In normal situations this amount remains within reasonable bounds. 
Many connections, but few disconnections typically indicate a possible connection leak in the client. The disconnecting of idle connections in the relay here is to guard against resource drain in such scenarios. * dispatch\_busy The number of dispatchers actively doing work at the moment of the sample. This is just an indication of the work pressure on the relay. * dispatch\_idle The number of dispatchers sleeping at the moment of the sample. When this number nears 0, dispatch\_busy should be high. When the configured number of worker threads is low, this might mean more worker threads should be added (if the system allows it) or the relay is reaching its limits with regard to how much it can process. A relay with no idle dispatchers will likely appear slow for clients, for the relay has too much work to serve them instantly. * dispatch\_wallTime\_us The number of microseconds spent by the dispatchers to do their work. In particular on multi-core systems, this value can be confusing, however, it indicates how long the dispatchers were doing work handling clients. It includes everything they do, from reading data from a socket, cleaning up the input metric, to adding the metric to the appropriate queues. The larger the configuration, and more complex in terms of matches, the more time the dispatchers will spend on the cpu. * server\_wallTime\_us The number of microseconds spent by the servers to send the metrics from their queues. This value includes connection creation, reading from the queue, and sending metrics over the network. * dispatcherX For each individual dispatcher, the metrics received and blackholed plus the wall clock time. The values are as described above. * destinations.X For all known destinations, the number of dropped, queued and sent metrics plus the wall clock time spent. The values are as described above. * aggregators.metricsReceived The number of metrics that matched an aggregator rule and were accepted by the aggregator. 
When a metric matches multiple aggregators, this value will reflect that. A metric is not counted when it is considered syntactically invalid, e.g. no value was found. * aggregators.metricsDropped The number of metrics that were sent to an aggregator, but did not fit timewise. This is because the metric was either too far in the past or too far in the future. The expire after clause in aggregate statements controls how long in the past metric values are accepted. * aggregators.metricsSent The number of metrics that were sent from the aggregators. These metrics were produced and are the actual results of aggregations. Performance ----------- The original argument for building carbon-c-relay was speed, with configurability following closely. To date, performance has bypassed the original carbon-relay.py by orders of magnitude, but the actual speed highly depends on perception and scenario. What follows below are some rough numbers about the environment at Booking.com where carbon-c-relay is used extensively in production. carbon-c-relay runs on all of our machines as a local submission relay. Its config is simply a match all to a `any_of` cluster with a number of upstream relays to try and send the metrics to. These relays run with 4 workers, and receive a minimal amount of metrics per minute, typically between 50 and 200. These instances take typically around 19MiB of RAM and consume at top 0.8% CPU of a 2.4GHz core. The minimal footprint of the relay is a desired property for running on all of our machines. The main relays we run have roughly 20 clusters defined with `fnv1a_ch` hash. Average clustersize around 10 members. On top of that 30 match rules are defined. For a mildly-loaded relay receiving 1M metrics per minute, the relay consumes 750MiB of RAM and needs around 40% of a 2.4GHz core. A relay with more load but the same configuration, 3M metrics per minute, needs almost 2GiB of RAM, and some 45% CPU of a 2.4GHz core. 
The memory usage is mainly in the buffers for writing to the server stores. On the stores, we run relays with a simple config with a match all rule to an `any_of` cluster pointing to 13 locally running carbon-cache.py instances. These relays receive up to 1.7M metrics per minute, and require some 110MiB RAM for that. The CPU usage is around 15% of a 2.4GHz core. For aggregations we don't do much traffic (55K per minute) on a couple of aggregations expanding to a thousand of metrics. In our setup this takes 30MiB of RAM usage with some 30% CPU usage. Author ------ Fabian Groffen Acknowledgement --------------- This program was originally developed for Booking.com. With approval from Booking.com, the code was generalised and published as Open Source on github, for which the author would like to express his gratitude. carbon-c-relay-1.7/aggregator.c000066400000000000000000000426251265266732300164720ustar00rootroot00000000000000/* * Copyright 2013-2016 Fabian Groffen * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include #include #include #include #include #include #include #include #include #include "relay.h" #include "dispatcher.h" #include "server.h" #include "router.h" #include "aggregator.h" #include "fnv1a.h" static pthread_t aggregatorid; static size_t prevreceived = 0; static size_t prevsent = 0; static size_t prevdropped = 0; static char keep_running = 1; /** * Allocates a new aggregator setup to hold buckets matching interval * and expiry time. 
*/ aggregator * aggregator_new( unsigned int interval, unsigned int expire, enum _aggr_timestamp tswhen) { aggregator *ret = malloc(sizeof(aggregator)); int intconn[2]; if (ret == NULL) return ret; assert(interval != 0); assert(interval < expire); if (pipe(intconn) < 0) { logerr("failed to create pipe for aggregator: %s\n", strerror(errno)); free(ret); return NULL; } ret->disp_conn = dispatch_addconnection_aggr(intconn[0]); ret->fd = intconn[1]; ret->interval = interval; ret->expire = expire; ret->tswhen = tswhen; ret->bucketcnt = (expire + (interval - 1)) / interval + 1 + 1; ret->received = 0; ret->sent = 0; ret->dropped = 0; ret->computes = NULL; ret->next = NULL; pthread_mutex_init(&ret->bucketlock, NULL); return ret; } /** * Adds a new compute part to this aggregator. Returns -1 if type is * not a recognised aggregation type. */ char aggregator_add_compute( aggregator *s, const char *metric, const char *type) { struct _aggr_computes *ac = s->computes; enum _aggr_compute_type act; char store = 0; int pctl = 0; if (strcmp(type, "sum") == 0) { act = SUM; } else if (strcmp(type, "count") == 0 || strcmp(type, "cnt") == 0) { act = CNT; } else if (strcmp(type, "max") == 0) { act = MAX; } else if (strcmp(type, "min") == 0) { act = MIN; } else if (strcmp(type, "average") == 0 || strcmp(type, "avg") == 0) { act = AVG; } else if (strcmp(type, "median") == 0) { act = MEDN; pctl = 50; store = 1; } else if (strncmp(type, "percentile", strlen("percentile")) == 0) { pctl = atoi(type + strlen("percentile")); if (pctl > 100 || pctl <= 0) { return -1; } else { act = PCTL; store = 1; } } else if (strcmp(type, "variance") == 0) { act = VAR; store = 1; } else if (strcmp(type, "stddev") == 0) { act = SDEV; store = 1; } else { return -1; } if (ac == NULL) { ac = s->computes = malloc(sizeof(*ac)); } else { while (ac->next != NULL) ac = ac->next; ac = ac->next = malloc(sizeof(*ac)); } ac->type = act; ac->percentile = (unsigned char)pctl; ac->metric = strdup(metric); 
memset(ac->invocations_ht, 0, sizeof(ac->invocations_ht)); ac->entries_needed = store; ac->next = NULL; return 0; } void aggregator_set_stub( aggregator *s, const char *stubname) { struct _aggr_computes *ac; char newmetric[METRIC_BUFSIZ]; for (ac = s->computes; ac != NULL; ac = ac->next) { snprintf(newmetric, sizeof(newmetric), "%s%s", stubname, ac->metric); free((void *)ac->metric); ac->metric = strdup(newmetric); } } /** * Adds a new metric to aggregator s. The value from the metric is put * in the bucket matching the epoch contained in the metric. In cases * where the contained epoch is too old or too new, the metric is * dropped. */ void aggregator_putmetric( aggregator *s, const char *metric, const char *firstspace, size_t nmatch, regmatch_t *pmatch) { char *v; double val; long long int epoch; long long int itime; int slot; char newmetric[METRIC_BUFSIZ]; char *newfirstspace = NULL; size_t len; const char *ometric; const char *omp; unsigned int omhash; unsigned int omhtbucket; struct _aggr_computes *compute; struct _aggr_invocations *invocation; struct _aggr_bucket *bucket; struct _aggr_bucket_entries *entries; /* get value */ if ((v = strchr(firstspace + 1, ' ')) == NULL) { /* metric includes \n */ if (mode == DEBUG || mode == DEBUGTEST) logerr("aggregator: dropping incorrect metric: %s", metric); return; } s->received++; val = atof(firstspace + 1); epoch = atoll(v + 1); pthread_mutex_lock(&s->bucketlock); for (compute = s->computes; compute != NULL; compute = compute->next) { if (nmatch == 0) { ometric = compute->metric; } else if ((len = router_rewrite_metric( &newmetric, &newfirstspace, metric, firstspace, compute->metric, nmatch, pmatch)) == 0) { /* fail, skip */ continue; } else { *newfirstspace = '\0'; ometric = newmetric; } omhash = FNV1A_32_OFFSET; for (omp = ometric; *omp != '\0'; omp++) omhash = (omhash ^ (unsigned int)*omp) * FNV1A_32_PRIME; omhtbucket = ((omhash >> AGGR_HT_POW_SIZE) ^ omhash) & (((unsigned int)1 << AGGR_HT_POW_SIZE) - 1); 
invocation = compute->invocations_ht[omhtbucket]; for (; invocation != NULL; invocation = invocation->next) if (invocation->hash == omhash && strcmp(ometric, invocation->metric) == 0) /* match */ break; if (invocation == NULL) { /* no match, add */ int i; time_t now; if ((invocation = malloc(sizeof(*invocation))) == NULL) { logerr("aggregator: out of memory creating %s from %s", ometric, metric); continue; } if ((invocation->metric = strdup(ometric)) == NULL) { logerr("aggregator: out of memory creating %s from %s", ometric, metric); free(invocation); continue; } invocation->hash = omhash; /* Start buckets in the past such that expiry time * conditions are met. Add a splay to the expiry time to * avoid a thundering herd of expirations when the * aggregator is spammed with metrics, e.g. right after * startup when other relays flush their queues. This * approach shouldn't affect the timing of the buckets as * requested in issue #72. * For consistency with other tools/old carbon-aggregator * align the buckets to interval boundaries such that it is * predictable what intervals will be taken, issue #104. 
*/ time(&now); now = ((now - s->expire) / s->interval) * s->interval; invocation->expire = s->expire + (rand() % s->interval); /* allocate enough buckets to hold the past + future */ invocation->buckets = malloc(sizeof(struct _aggr_bucket) * s->bucketcnt); if (invocation->buckets == NULL) { logerr("aggregator: out of memory creating %s from %s", ometric, metric); free(invocation->metric); free(invocation); continue; } for (i = 0; i < s->bucketcnt; i++) { invocation->buckets[i].start = now + (i * s->interval); invocation->buckets[i].cnt = 0; invocation->buckets[i].entries.size = 0; invocation->buckets[i].entries.values = NULL; } invocation->next = compute->invocations_ht[omhtbucket]; compute->invocations_ht[omhtbucket] = invocation; } /* finally, try to do the maths */ itime = epoch - invocation->buckets[0].start; if (itime < 0) { /* drop too old metric */ s->dropped++; continue; } slot = itime / s->interval; if (slot >= s->bucketcnt) { if (mode == DEBUG || mode == DEBUGTEST) logerr("aggregator: dropping metric too far in the " "future (%lld > %lld): %s from %s", epoch, invocation->buckets[s->bucketcnt - 1].start, ometric, metric); s->dropped++; continue; } bucket = &invocation->buckets[slot]; if (bucket->cnt == 0) { bucket->sum = val; bucket->max = val; bucket->min = val; } else { bucket->sum += val; if (bucket->max < val) bucket->max = val; if (bucket->min > val) bucket->min = val; } entries = &bucket->entries; if (compute->entries_needed) { if (bucket->cnt == entries->size) { #define E_I_SZ 64 double *new = realloc(entries->values, sizeof(double) * (entries->size + E_I_SZ)); if (new == NULL) { logerr("aggregator: out of memory creating entry bucket " "(%s from %s)", ometric, metric); } else { entries->values = new; entries->size += E_I_SZ; } } if (bucket->cnt < entries->size) entries->values[bucket->cnt] = val; } bucket->cnt++; } pthread_mutex_unlock(&s->bucketlock); return; } static inline int cmp_entry(const void *l, const void *r) { return *(const double *)l - 
*(const double *)r; } /** * Checks if the oldest bucket should be expired, if so, sends out * computed aggregate metrics and moves the bucket to the end of the * list. When no buckets are in use for an invocation, it is removed to * cleanup resources. */ static void * aggregator_expire(void *sub) { aggregator *aggrs = (aggregator *)sub; time_t now; aggregator *s; struct _aggr_bucket *b; struct _aggr_computes *c; struct _aggr_invocations *inv; struct _aggr_invocations *lastinv; double *values; size_t len = 0; int i; unsigned char j; int work; char metric[METRIC_BUFSIZ]; char isempty; long long int ts = 0; while (1) { work = 0; for (s = aggrs; s != NULL; s = s->next) { /* send metrics for buckets that are completely past the * expiry time, unless we are shutting down, then send * metrics for all buckets that have completed */ now = time(NULL) + (keep_running ? 0 : s->expire - s->interval); for (c = s->computes; c != NULL; c = c->next) { for (i = 0; i < (1 << AGGR_HT_POW_SIZE); i++) { lastinv = NULL; isempty = 0; for (inv = c->invocations_ht[i]; inv != NULL; ) { while (inv->buckets[0].start + (keep_running ? 
inv->expire : s->expire) < now) { /* yay, let's produce something cool */ b = &inv->buckets[0]; /* avoid emitting empty/unitialised data */ isempty = b->cnt == 0; if (!isempty) { switch (s->tswhen) { case TS_START: ts = b->start; break; case TS_MIDDLE: ts = b->start + (s->interval / 2); break; case TS_END: ts = b->start + s->interval; break; default: assert(0); } switch (c->type) { case SUM: len = snprintf(metric, sizeof(metric), "%s %f %lld\n", inv->metric, b->sum, ts); break; case CNT: len = snprintf(metric, sizeof(metric), "%s %zu %lld\n", inv->metric, b->cnt, ts); break; case MAX: len = snprintf(metric, sizeof(metric), "%s %f %lld\n", inv->metric, b->max, ts); break; case MIN: len = snprintf(metric, sizeof(metric), "%s %f %lld\n", inv->metric, b->min, ts); break; case AVG: len = snprintf(metric, sizeof(metric), "%s %f %lld\n", inv->metric, b->sum / (double)b->cnt, ts); break; case MEDN: /* median == 50th percentile */ case PCTL: { /* nearest rank method */ size_t n = (int)(((double)c->percentile/100.0 * (double)b->cnt) + 0.9); values = b->entries.values; /* TODO: lazy approach, in case * of 1 (first/last) or 2 buckets * distance we could do a * forward run picking the max * entries and returning that * iso sorting the full array */ qsort(values, b->cnt, sizeof(double), cmp_entry); len = snprintf(metric, sizeof(metric), "%s %f %lld\n", inv->metric, values[n - 1], ts); } break; case VAR: case SDEV: { double avg = b->sum / (double)b->cnt; double ksum = 0; values = b->entries.values; for (i = 0; i < b->cnt; i++) ksum += pow(values[i] - avg, 2); ksum /= (double)b->cnt; len = snprintf(metric, sizeof(metric), "%s %f %lld\n", inv->metric, c->type == VAR ? 
ksum : sqrt(ksum), ts); } break; default: assert(0); /* for compiler (len) */ } if (write(s->fd, metric, len) != len) { s->dropped++; } else { s->sent++; } } /* move the bucket to the end, to make room for * new ones */ pthread_mutex_lock(&s->bucketlock); b = &inv->buckets[0]; len = b->entries.size; values = b->entries.values; memmove(b, &inv->buckets[1], sizeof(*b) * (s->bucketcnt - 1)); b = &inv->buckets[s->bucketcnt - 1]; b->cnt = 0; b->start = inv->buckets[s->bucketcnt - 2].start + s->interval; b->entries.size = len; b->entries.values = values; pthread_mutex_unlock(&s->bucketlock); work++; } if (isempty) { /* see if the remaining buckets are empty too */ pthread_mutex_lock(&s->bucketlock); for (j = 0; j < s->bucketcnt; j++) { if (inv->buckets[j].cnt != 0) { isempty = 0; pthread_mutex_unlock(&s->bucketlock); break; } } } if (isempty) { /* free and unlink */ if (c->entries_needed) for (j = 0; j < s->bucketcnt; j++) if (inv->buckets[j].entries.values) free(inv->buckets[j].entries.values); free(inv->metric); free(inv->buckets); if (lastinv != NULL) { lastinv->next = inv->next; free(inv); inv = lastinv->next; } else { c->invocations_ht[i] = inv->next; free(inv); inv = c->invocations_ht[i]; } pthread_mutex_unlock(&s->bucketlock); } else { lastinv = inv; inv = inv->next; } } } } } if (work == 0) { if (!keep_running) break; /* nothing done, avoid spinlocking */ usleep(250 * 1000); /* 250ms */ } } /* free up value buckets */ while ((s = aggrs) != NULL) { while (s->computes != NULL) { c = s->computes; free((void *)c->metric); for (i = 0; i < 1 << AGGR_HT_POW_SIZE; i++) { inv = c->invocations_ht[i]; while (inv != NULL) { struct _aggr_invocations *invocation = inv; free(inv->metric); if (c->entries_needed) for (j = 0; j < s->bucketcnt; j++) if (inv->buckets[j].entries.values) free(inv->buckets[j].entries.values); free(inv->buckets); inv = invocation->next; free(invocation); } } s->computes = c->next; free(c); } aggrs = aggrs->next; free(s); } return NULL; } /** * Returns 
the number of aggregators defined. */ size_t aggregator_numaggregators(aggregator *aggrs) { size_t totaggregators = 0; aggregator *a; for (a = aggrs; a != NULL; a = a->next) totaggregators++; return totaggregators; } /** * Returns the total number of computations defined. */ size_t aggregator_numcomputes(aggregator *aggrs) { size_t totcomputes = 0; aggregator *a; struct _aggr_computes *c; for (a = aggrs; a != NULL; a = a->next) for (c = a->computes; c != NULL; c = c->next) totcomputes++; return totcomputes; } /** * Initialises and starts the aggregator. Returns false when starting * failed, true otherwise. */ int aggregator_start(aggregator *aggrs) { keep_running = 1; if (pthread_create(&aggregatorid, NULL, aggregator_expire, aggrs) != 0) return 0; return 1; } /** * Shuts down the aggregator. */ void aggregator_stop(void) { keep_running = 0; pthread_join(aggregatorid, NULL); } /** * Returns an approximate number of received metrics by all aggregators. */ size_t aggregator_get_received(aggregator *a) { size_t totreceived = 0; for ( ; a != NULL; a = a->next) totreceived += a->received; return totreceived; } /** * Returns an approximate number of metrics received by all aggregators * since the last call to this function. */ inline size_t aggregator_get_received_sub(aggregator *aggrs) { size_t d = aggregator_get_received(aggrs); size_t r = d - prevreceived; prevreceived += d; return r; } /** * Returns an approximate number of metrics sent by all aggregators. */ size_t aggregator_get_sent(aggregator *a) { size_t totsent = 0; for ( ; a != NULL; a = a->next) totsent += a->sent; return totsent; } /** * Returns an approximate number of metrics sent by all aggregators * since the last call to this function. */ inline size_t aggregator_get_sent_sub(aggregator *aggrs) { size_t d = aggregator_get_sent(aggrs); size_t r = d - prevsent; prevsent += d; return r; } /** * Returns an approximate number of dropped metrics by all aggregators. 
* Metrics are dropped if they are too much in the past (past expiry * time) or if they are too much in the future. */ size_t aggregator_get_dropped(aggregator *a) { size_t totdropped = 0; for ( ; a != NULL; a = a->next) totdropped += a->dropped; return totdropped; } /** * Returns an approximate number of metrics dropped by all aggregators * since the last call to this function. */ inline size_t aggregator_get_dropped_sub(aggregator *aggrs) { size_t d = aggregator_get_dropped(aggrs); size_t r = d - prevdropped; prevdropped += d; return r; } carbon-c-relay-1.7/aggregator.h000066400000000000000000000053731265266732300164760ustar00rootroot00000000000000/* * Copyright 2013-2016 Fabian Groffen * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef AGGREGATOR_H #define AGGREGATOR_H 1 #include #include #include "server.h" #define AGGR_HT_POW_SIZE 12 /* 4096: too big? 
issue #60 */ typedef struct _aggregator { unsigned short interval; /* when to perform the aggregation */ unsigned short expire; /* when incoming metrics are no longer valid */ enum _aggr_timestamp { TS_START, TS_MIDDLE, TS_END } tswhen; unsigned char bucketcnt; int disp_conn; int fd; size_t received; size_t sent; size_t dropped; struct _aggr_computes { enum _aggr_compute_type { SUM, CNT, MAX, MIN, AVG, MEDN, PCTL, VAR, SDEV } type; const char *metric; /* name template of metric to produce */ struct _aggr_invocations { char *metric; /* actual name to emit */ unsigned int hash; /* to speed up matching */ unsigned short expire; /* expire + splay */ struct _aggr_bucket { long long int start; size_t cnt; double sum; double max; double min; struct _aggr_bucket_entries { size_t size; double *values; } entries; } *buckets; struct _aggr_invocations *next; } *invocations_ht[1 << AGGR_HT_POW_SIZE]; unsigned char entries_needed:1; unsigned char percentile:7; struct _aggr_computes *next; } *computes; pthread_mutex_t bucketlock; struct _aggregator *next; } aggregator; aggregator *aggregator_new(unsigned int interval, unsigned int expire, enum _aggr_timestamp tswhen); char aggregator_add_compute(aggregator *s, const char *metric, const char *type); void aggregator_set_stub(aggregator *s, const char *stubname); void aggregator_putmetric(aggregator *s, const char *metric, const char *firstspace, size_t nmatch, regmatch_t *pmatch); int aggregator_start(aggregator *aggrs); void aggregator_stop(void); size_t aggregator_numaggregators(aggregator *agrs); size_t aggregator_numcomputes(aggregator *aggrs); size_t aggregator_get_received(aggregator *aggrs); size_t aggregator_get_sent(aggregator *aggrs); size_t aggregator_get_dropped(aggregator *aggrs); size_t aggregator_get_received_sub(aggregator *aggrs); size_t aggregator_get_sent_sub(aggregator *aggrs); size_t aggregator_get_dropped_sub(aggregator *aggrs); #endif 
carbon-c-relay-1.7/collector.c000066400000000000000000000307601265266732300163330ustar00rootroot00000000000000/* * Copyright 2013-2016 Fabian Groffen * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include #include #include #include #include #include "relay.h" #include "dispatcher.h" #include "server.h" #include "aggregator.h" #include "collector.h" static dispatcher **dispatchers; static char debug = 0; static pthread_t collectorid; static char keep_running = 1; int collector_interval = 60; static char cluster_refresh_pending = 0; static cluster *pending_clusters = NULL; static aggregator *pending_aggrs = NULL; /** * Collects metrics from dispatchers and servers and emits them. 
*/ static void * collector_runner(void *s) { int i; size_t totticks; size_t totmetrics; size_t totblackholes; size_t totqueued; size_t totstalls; size_t totdropped; size_t ticks; size_t metrics; size_t blackholes; size_t queued; size_t stalls; size_t dropped; size_t dispatchers_idle; size_t dispatchers_busy; time_t now; time_t nextcycle; char ipbuf[32]; char *p; size_t numaggregators = 0; aggregator *aggrs = NULL; server *submission = (server *)s; server **srvs = NULL; char metric[METRIC_BUFSIZ]; char *m; size_t sizem = 0; size_t (*s_ticks)(server *); size_t (*s_metrics)(server *); size_t (*s_stalls)(server *); size_t (*s_dropped)(server *); size_t (*d_ticks)(dispatcher *); size_t (*d_metrics)(dispatcher *); size_t (*d_blackholes)(dispatcher *); size_t (*a_received)(aggregator *); size_t (*a_sent)(aggregator *); size_t (*a_dropped)(aggregator *); /* prepare hostname for graphite metrics */ snprintf(metric, sizeof(metric), "carbon.relays.%s", relay_hostname); for (m = metric + strlen("carbon.relays."); *m != '\0'; m++) if (*m == '.') *m = '_'; *m++ = '.'; *m = '\0'; sizem = sizeof(metric) - (m - metric); /* setup functions to target what the user wants */ if (debug & 2) { s_ticks = server_get_ticks_sub; s_metrics = server_get_metrics_sub; s_stalls = server_get_stalls_sub; s_dropped = server_get_dropped_sub; d_ticks = dispatch_get_ticks_sub; d_metrics = dispatch_get_metrics_sub; d_blackholes = dispatch_get_blackholes_sub; a_received = aggregator_get_received_sub; a_sent = aggregator_get_sent_sub; a_dropped = aggregator_get_dropped_sub; } else { s_ticks = server_get_ticks; s_metrics = server_get_metrics; s_stalls = server_get_stalls; s_dropped = server_get_dropped; d_ticks = dispatch_get_ticks; d_metrics = dispatch_get_metrics; d_blackholes = dispatch_get_blackholes; a_received = aggregator_get_received; a_sent = aggregator_get_sent; a_dropped = aggregator_get_dropped; } #define send(metric) \ if (debug & 1) \ logout("%s", metric); \ else \ server_send(submission, 
strdup(metric), 1); nextcycle = time(NULL) + collector_interval; while (keep_running) { if (cluster_refresh_pending) { server **newservers = router_getservers(pending_clusters); if (srvs != NULL) free(srvs); srvs = newservers; aggrs = pending_aggrs; numaggregators = aggregator_numaggregators(aggrs); cluster_refresh_pending = 0; } assert(srvs != NULL); sleep(1); now = time(NULL); if (nextcycle > now) continue; nextcycle += collector_interval; totticks = 0; totmetrics = 0; totblackholes = 0; dispatchers_idle = 0; dispatchers_busy = 0; for (i = 0; dispatchers[i] != NULL; i++) { if (dispatch_busy(dispatchers[i])) { dispatchers_busy++; } else { dispatchers_idle++; } totticks += ticks = d_ticks(dispatchers[i]); totmetrics += metrics = d_metrics(dispatchers[i]); totblackholes += blackholes = d_blackholes(dispatchers[i]); snprintf(m, sizem, "dispatcher%d.metricsReceived %zu %zu\n", i + 1, metrics, (size_t)now); send(metric); snprintf(m, sizem, "dispatcher%d.metricsBlackholed %zu %zu\n", i + 1, blackholes, (size_t)now); send(metric); snprintf(m, sizem, "dispatcher%d.wallTime_us %zu %zu\n", i + 1, ticks, (size_t)now); send(metric); } snprintf(m, sizem, "metricsReceived %zu %zu\n", totmetrics, (size_t)now); send(metric); snprintf(m, sizem, "metricsBlackholed %zu %zu\n", totblackholes, (size_t)now); send(metric); snprintf(m, sizem, "dispatch_wallTime_us %zu %zu\n", totticks, (size_t)now); send(metric); snprintf(m, sizem, "dispatch_busy %zu %zu\n", dispatchers_busy, (size_t)now); send(metric); snprintf(m, sizem, "dispatch_idle %zu %zu\n", dispatchers_idle, (size_t)now); send(metric); #define send_server_metrics(ipbuf, ticks, metrics, queued, stalls, dropped) \ snprintf(m, sizem, "destinations.%s.sent %zu %zu\n", \ ipbuf, metrics, (size_t)now); \ send(metric); \ snprintf(m, sizem, "destinations.%s.queued %zu %zu\n", \ ipbuf, queued, (size_t)now); \ send(metric); \ snprintf(m, sizem, "destinations.%s.stalls %zu %zu\n", \ ipbuf, stalls, (size_t)now); \ send(metric); \ snprintf(m, 
sizem, "destinations.%s.dropped %zu %zu\n", \ ipbuf, dropped, (size_t)now); \ send(metric); \ snprintf(m, sizem, "destinations.%s.wallTime_us %zu %zu\n", \ ipbuf, ticks, (size_t)now); \ send(metric); totticks = 0; totmetrics = 0; totqueued = 0; totstalls = 0; totdropped = 0; /* exclude internal_submission metrics from the totals to avoid * artificial doubles due to internal routing details */ strncpy(ipbuf, "internal", sizeof(ipbuf)); ticks = s_ticks(submission); metrics = s_metrics(submission); queued = server_get_queue_len(submission); stalls = s_stalls(submission); dropped = s_dropped(submission); send_server_metrics(ipbuf, ticks, metrics, queued, stalls, dropped); for (i = 0; srvs[i] != NULL; i++) { if (server_ctype(srvs[i]) == CON_FILE) { strncpy(ipbuf, server_ip(srvs[i]), sizeof(ipbuf)); } else { snprintf(ipbuf, sizeof(ipbuf), "%s:%u", server_ip(srvs[i]), server_port(srvs[i])); } for (p = ipbuf; *p != '\0'; p++) if (*p == '.') *p = '_'; totticks += ticks = s_ticks(srvs[i]); totmetrics += metrics = s_metrics(srvs[i]); totqueued += queued = server_get_queue_len(srvs[i]); totstalls += stalls = s_stalls(srvs[i]); totdropped += dropped = s_dropped(srvs[i]); send_server_metrics(ipbuf, ticks, metrics, queued, stalls, dropped); } snprintf(m, sizem, "metricsSent %zu %zu\n", totmetrics, (size_t)now); send(metric); snprintf(m, sizem, "metricsQueued %zu %zu\n", totqueued, (size_t)now); send(metric); snprintf(m, sizem, "metricStalls %zu %zu\n", totstalls, (size_t)now); send(metric); snprintf(m, sizem, "metricsDropped %zu %zu\n", totdropped, (size_t)now); send(metric); snprintf(m, sizem, "server_wallTime_us %zu %zu\n", totticks, (size_t)now); send(metric); snprintf(m, sizem, "connections %zu %zu\n", dispatch_get_accepted_connections(), (size_t)now); send(metric); snprintf(m, sizem, "disconnects %zu %zu\n", dispatch_get_closed_connections(), (size_t)now); send(metric); if (numaggregators > 0) { snprintf(m, sizem, "aggregators.metricsReceived %zu %zu\n", a_received(aggrs), 
(size_t)now); send(metric); snprintf(m, sizem, "aggregators.metricsSent %zu %zu\n", a_sent(aggrs), (size_t)now); send(metric); snprintf(m, sizem, "aggregators.metricsDropped %zu %zu\n", a_dropped(aggrs), (size_t)now); send(metric); } if (debug & 1) fflush(stdout); } if (srvs != NULL) free(srvs); return NULL; } /** * Writes messages about dropped events or high queue sizes. */ static size_t lastdropped = 0; static size_t lastaggrdropped = 0; static void * collector_writer(void *unused) { int i = 0; size_t queued; size_t queuesize; double queueusage; size_t totdropped; size_t lastconn = 0; size_t lastdisc = 0; size_t numaggregators = 0; server **srvs = NULL; aggregator *aggrs = NULL; while (keep_running) { if (cluster_refresh_pending) { server **newservers = router_getservers(pending_clusters); if (srvs != NULL) free(srvs); srvs = newservers; aggrs = pending_aggrs; numaggregators = aggregator_numaggregators(aggrs); cluster_refresh_pending = 0; } assert(srvs != NULL); sleep(1); if (debug & 1) { size_t mpsout; size_t totout; size_t mpsdrop; size_t totdrop; size_t totqueue; size_t mpsin; size_t totin; size_t totconn; size_t totdisc; short widle; short wbusy; int j; /* Solaris iostat like output: metrics in metrics out metrics drop queue conns disconn workr mps tot mps tot dps tot cur cps tot dps tot id bs 99999 9999999 99999 9999999 99999 9999999 99999 999 99999 999 99999 99 99 */ if (i % 24 == 0) printf(" metrics in metrics out metrics drop queue conns disconn workr\n" " mps tot mps tot dps tot cur cps tot dps tot id bs\n"); mpsout = totout = 0; mpsdrop = totdrop = 0; totqueue = 0; for (j = 0; srvs[j] != NULL; j++) { mpsout += server_get_metrics_sub(srvs[j]); totout += server_get_metrics(srvs[j]); mpsdrop += server_get_dropped_sub(srvs[j]); totdrop += server_get_dropped(srvs[j]); totqueue += server_get_queue_len(srvs[j]); } mpsin = totin = 0; widle = wbusy = 0; for (j = 0; dispatchers[j] != NULL; j++) { mpsin += dispatch_get_metrics_sub(dispatchers[j]); totin += 
dispatch_get_metrics(dispatchers[j]); if (dispatch_busy(dispatchers[j])) { wbusy++; } else { widle++; } } totconn = dispatch_get_accepted_connections(); totdisc = dispatch_get_closed_connections(); printf("%5zu %7zu " /* metrics in */ "%5zu %7zu " /* metrics out */ "%5zu %7zu " /* metrics dropped */ "%5zu " /* queue */ "%3zu %5zu " /* conns */ "%3zu %5zu " /* disconns */ "%2d %2d\n", /* workers */ mpsin, totin, mpsout, totout, mpsdrop, totdrop, totqueue, totconn - lastconn, totconn, totdisc - lastdisc, totdisc, widle, wbusy ); lastconn = totconn; lastdisc = totdisc; } i++; if (i < collector_interval) continue; totdropped = 0; for (i = 0; srvs[i] != NULL; i++) { queued = server_get_queue_len(srvs[i]); queuesize = server_get_queue_size(srvs[i]); totdropped += server_get_dropped(srvs[i]); queueusage = (double)queued / (double)queuesize; if (queueusage >= 0.75) logout("warning: metrics queuing up " "for %s:%u: %zu metrics (%d%% of queue size)\n", server_ip(srvs[i]), server_port(srvs[i]), queued, (int)(queueusage * 100)); } if (totdropped - lastdropped > 0) logout("warning: dropped %zu metrics\n", totdropped - lastdropped); lastdropped = totdropped; if (numaggregators > 0) { totdropped = aggregator_get_dropped(aggrs); if (totdropped - lastaggrdropped > 0) logout("warning: aggregator dropped %zu metrics\n", totdropped - lastaggrdropped); lastaggrdropped = totdropped; } i = 0; } return NULL; } /** * Schedules routes r to be put in place for the current routes. The * replacement is performed at the next cycle of the collector. */ inline void collector_schedulereload(cluster *c, aggregator *a) { pending_clusters = c; pending_aggrs = a; cluster_refresh_pending = 1; } /** * Returns true if the routes scheduled to be reloaded by a call to * collector_schedulereload() have been activated. */ inline char collector_reloadcomplete(void) { return cluster_refresh_pending == 0; } /** * Initialises and starts the collector. 
*/ void collector_start(dispatcher **d, cluster *c, aggregator *a, server *submission, char cum) { dispatchers = d; collector_schedulereload(c, a); if (mode == DEBUG || mode == DEBUGTEST || mode == DEBUGSUBMISSION) debug = 1; debug |= (cum ? 0 : 2); if (mode != SUBMISSION && mode != DEBUGSUBMISSION) { if (pthread_create(&collectorid, NULL, collector_runner, submission) != 0) logerr("failed to start collector!\n"); } else { if (pthread_create(&collectorid, NULL, collector_writer, NULL) != 0) logerr("failed to start collector!\n"); } } /** * Shuts down the collector. */ void collector_stop(void) { keep_running = 0; pthread_join(collectorid, NULL); } carbon-c-relay-1.7/collector.h000066400000000000000000000022001265266732300163240ustar00rootroot00000000000000/* * Copyright 2013-2016 Fabian Groffen * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef COLLECTOR_H #define COLLECTOR_H 1 #include "dispatcher.h" #include "router.h" #include "aggregator.h" #include "server.h" #include "relay.h" extern int collector_interval; #define timediff(X, Y) \ (Y.tv_sec > X.tv_sec ? 
(Y.tv_sec - X.tv_sec) * 1000 * 1000 + ((Y.tv_usec - X.tv_usec)) : Y.tv_usec - X.tv_usec) void collector_start(dispatcher **d, cluster *c, aggregator *a, server *submission, char cum); void collector_stop(void); void collector_schedulereload(cluster *c, aggregator *a); char collector_reloadcomplete(void); #endif carbon-c-relay-1.7/consistent-hash.c000066400000000000000000000302031265266732300174470ustar00rootroot00000000000000/* * Copyright 2013-2016 Fabian Groffen * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include #include #include #include "fnv1a.h" #include "md5.h" #include "server.h" #define CH_RING struct _ch_ring #include "consistent-hash.h" /* This value is hardwired in the carbon sources, and necessary to get * fair (re)balancing of metrics in the hash ring. Because the value * seems reasonable, we use the same value for carbon and fnv1a hash * implementations. */ #define HASH_REPLICAS 100 typedef struct _ring_entry { unsigned short pos; unsigned char malloced:1; server *server; struct _ring_entry *next; } ch_ring_entry; struct _ch_ring { ch_type type; unsigned char hash_replicas; ch_ring_entry *entries; ch_ring_entry **entrylist; /* only used with jump hash */ int entrycnt; }; /** * Computes the hash position for key in a 16-bit unsigned integer * space. Returns a number between 0 and 65535 based on the highest 2 * bytes of the MD5 sum of key. 
*/ static unsigned short carbon_hashpos(const char *key, const char *end) { unsigned char md5[MD5_DIGEST_LENGTH]; MD5((unsigned char *)key, end - key, md5); return ((md5[0] << 8) + md5[1]); } /** * Computes the hash position for key in a 16-bit unsigned integer * space. Returns a number between 0 and 65535 based on the FNV1a hash * algorithm. */ static unsigned short fnv1a_hashpos(const char *key, const char *end) { unsigned int hash; fnv1a_32(hash, key, key, end); return (unsigned short)((hash >> 16) ^ (hash & (unsigned int)0xFFFF)); } /** * Computes the bucket number for key in the range [0, bckcnt). The * algorithm used is the jump consistent hash by Lamping and Veach. */ static unsigned int jump_bucketpos(unsigned long long int key, int bckcnt) { long long int b = -1, j = 0; while (j < bckcnt) { b = j; key = key * 2862933555777941757ULL + 1; j = (long long int)((double)(b + 1) * ((double)(1LL << 31) / (double)((key >> 33) + 1)) ); } /* b cannot exceed the range of bckcnt, see while condition */ return (int)b; } /** * Sort comparator for ch_ring_entry structs on pos, ip and instance. */ static int entrycmp_carbon(const void *l, const void *r) { ch_ring_entry *ch_l = (ch_ring_entry *)l; ch_ring_entry *ch_r = (ch_ring_entry *)r; if (ch_l->pos != ch_r->pos) return ch_l->pos - ch_r->pos; #ifndef CH_CMP_V40_BEHAVIOUR { int d = strcmp(server_ip(ch_l->server), server_ip(ch_r->server)); char *i_l, *i_r; if (d != 0) return d; i_l = server_instance(ch_l->server); i_r = server_instance(ch_r->server); if (i_l == NULL && i_r == NULL) return 0; if (i_l == NULL) return 1; if (i_r == NULL) return -1; return strcmp(i_l, i_r); } #endif return 0; } /** * Sort comparator for ch_ring_entry structs on pos, ip and port. 
*/ static int entrycmp_fnv1a(const void *l, const void *r) { ch_ring_entry *ch_l = (ch_ring_entry *)l; ch_ring_entry *ch_r = (ch_ring_entry *)r; if (ch_l->pos != ch_r->pos) return ch_l->pos - ch_r->pos; #ifndef CH_CMP_V40_BEHAVIOUR { int d = strcmp(server_ip(ch_l->server), server_ip(ch_r->server)); if (d != 0) return d; return server_port(ch_l->server) - server_port(ch_r->server); } #endif return 0; } /** * Sort comparator for ch_ring_entry structs on instance only. */ static int entrycmp_jump_fnv1a(const void *l, const void *r) { char *si_l = server_instance(((ch_ring_entry *)l)->server); char *si_r = server_instance(((ch_ring_entry *)r)->server); return strcmp(si_l ? si_l : "", si_r ? si_r : ""); } ch_ring * ch_new(ch_type type) { ch_ring *ret = malloc(sizeof(ch_ring)); if (ret == NULL) return NULL; ret->type = type; switch (ret->type) { case CARBON: case FNV1a: ret->hash_replicas = HASH_REPLICAS; break; default: ret->hash_replicas = 1; break; } ret->entries = NULL; ret->entrylist = NULL; ret->entrycnt = 0; return ret; } /** * Computes the hash positions for the server name given. This is based * on the hashpos function. The server name usually is the IPv4 * address. The port component is just stored and not used in the * carbon hash calculation in case of carbon_ch. The instance component * is used in the hash calculation of carbon_ch, it is ignored for * fnv1a_ch. Returns an updated ring. 
 */
ch_ring *
ch_addnode(ch_ring *ring, server *s)
{
	int i;
	char buf[256];
	ch_ring_entry *entries;
	char *instance = server_instance(s);
	int (*cmp)(const void *, const void *) = NULL;

	if (ring == NULL)
		return NULL;

	/* one entry per replica; for JUMP_FNV1a hash_replicas is 1 */
	entries = (ch_ring_entry *)malloc(sizeof(ch_ring_entry) * ring->hash_replicas);
	if (entries == NULL)
		return NULL;

	switch (ring->type) {
		case CARBON:
			for (i = 0; i < ring->hash_replicas; i++) {
				/* this format is actually Python's tuple format that is
				 * used in serialised form as input for the hash */
				snprintf(buf, sizeof(buf), "('%s', %s%s%s):%d",
						server_ip(s),
						instance == NULL ? "" : "'",
						instance == NULL ? "None" : instance,
						instance == NULL ? "" : "'",
						i);
				/* carbon upstream committed:
				 * https://github.com/graphite-project/carbon/commit/024f9e67ca47619438951c59154c0dec0b0518c7
				 * this makes sure no collissions exist on pos, however,
				 * at the expense of being agnostic to the input order,
				 * therefore that change isn't implemented here, see
				 * https://github.com/grobian/carbon-c-relay/issues/84 */
				entries[i].pos = carbon_hashpos(buf, buf + strlen(buf));
				entries[i].server = s;
				entries[i].next = NULL;
				entries[i].malloced = 0;
			}
			cmp = *entrycmp_carbon;
			break;
		case FNV1a:
			for (i = 0; i < ring->hash_replicas; i++) {
				/* take all server info into account, such that
				 * different port numbers for the same hosts will work
				 * (unlike CARBON), unless we got a full overrride */
				if (instance == NULL) {
					snprintf(buf, sizeof(buf), "%d-%s:%u",
							i, server_ip(s), server_port(s));
				} else {
					snprintf(buf, sizeof(buf), "%d-%s", i, instance);
				}
				entries[i].pos = fnv1a_hashpos(buf, buf + strlen(buf));
				entries[i].server = s;
				entries[i].next = NULL;
				entries[i].malloced = 0;
			}
			cmp = *entrycmp_fnv1a;
			break;
		case JUMP_FNV1a:
			/* jump hash addresses buckets by index; pos is unused here */
			entries[0].pos = 0;
			entries[0].server = s;
			entries[0].next = NULL;
			entries[0].malloced = 0;
			cmp = *entrycmp_jump_fnv1a;
			break;
	}

	/* sort to allow merge joins later down the road */
	qsort(entries, ring->hash_replicas, sizeof(ch_ring_entry), cmp);
	/* mark the first element of this malloc()ed block so ch_free()
	 * knows which entry pointers were returned by malloc() above */
	entries[0].malloced = 1;

	if (ring->entries == NULL) {
		/* first node: simply chain up the sorted entries */
		for (i = 1; i < ring->hash_replicas; i++)
			entries[i - 1].next = &entries[i];
		ring->entries = entries;
	} else {
		/* merge-join the two rings */
		ch_ring_entry *w, *last;
		i = 0;
		last = NULL;
		assert(ring->hash_replicas > 0);
		for (w = ring->entries; w != NULL && i < ring->hash_replicas; ) {
			if (cmp(w, &entries[i]) <= 0) {
				last = w;
				w = w->next;
			} else {
				/* splice entries[i] in before w */
				entries[i].next = w;
				if (last == NULL) {
					ring->entries = &entries[i];
				} else {
					last->next = &entries[i];
				}
				last = &entries[i];
				i++;
			}
		}
		if (w != NULL) {
			/* all new entries consumed; keep remainder of old ring */
			last->next = w;
		} else {
			/* old ring exhausted; append the remaining new entries */
			last->next = &entries[i];
			for (i = i + 1; i < ring->hash_replicas; i++)
				entries[i - 1].next = &entries[i];
		}
	}

	if (ring->type == JUMP_FNV1a) {
		ch_ring_entry *w;
		/* count the ring, pos is purely cosmetic, it isn't used */
		for (w = ring->entries, i = 0; w != NULL; w = w->next, i++)
			w->pos = i;
		ring->entrycnt = i;
		/* this is really wasteful, but optimising this isn't worth it
		 * since it's called only a few times during config parsing */
		if (ring->entrylist != NULL)
			free(ring->entrylist);
		/* NOTE(review): malloc return is not checked here, and on the
		 * CONN_DESTS_SIZE error below the entries are not freed */
		ring->entrylist = malloc(sizeof(ch_ring_entry *) * ring->entrycnt);
		for (w = ring->entries, i = 0; w != NULL; w = w->next, i++)
			ring->entrylist[i] = w;
		if (i == CONN_DESTS_SIZE) {
			logerr("ch_addnode: nodes in use exceeds CONN_DESTS_SIZE, "
					"increase CONN_DESTS_SIZE in router.h\n");
			return NULL;
		}
	}

	return ring;
}

/**
 * Retrieve the nodes responsible for storing the given metric.  The
 * replcnt argument specifies how many hosts should be retrieved.
 * Results are stored in ret, an array of ch_ring pointers.  The
 * caller is responsible for ensuring that ret is large enough to store
 * replcnt pointers.
 */
void
ch_get_nodes(
		destination ret[],
		ch_ring *ring,
		const char replcnt,
		const char *metric,
		const char *firstspace)
{
	ch_ring_entry *w;
	unsigned short pos = 0;
	int i, j;

	switch (ring->type) {
		case CARBON:
			pos = carbon_hashpos(metric, firstspace);
			break;
		case FNV1a:
			pos = fnv1a_hashpos(metric, firstspace);
			break;
		case JUMP_FNV1a: {
			/* this is really a short route, since the jump hash gives
			 * us a bucket immediately */
			unsigned long long int hash;
			ch_ring_entry *bcklst[CONN_DESTS_SIZE];
			const char *p;

			i = ring->entrycnt;
			pos = replcnt;
			memcpy(bcklst, ring->entrylist, sizeof(bcklst[0]) * i);
			fnv1a_64(hash, p, metric, firstspace);
			while (i > 0) {
				j = jump_bucketpos(hash, i);
				/* each destination gets its own copy of the metric */
				(*ret).dest = bcklst[j]->server;
				(*ret).metric = strdup(metric);
				ret++;
				if (--pos == 0)
					break;
				/* use xorshift to generate a different hash for input
				 * in the jump hash again */
				hash ^= hash >> 12;
				hash ^= hash << 25;
				hash ^= hash >> 27;
				hash *= 2685821657736338717ULL;
				/* remove the server we just selected, such that we can
				 * be sure the next iteration will fetch another server */
				bcklst[j] = bcklst[--i];
			}
		}
			return;
	}
	assert(ring->entries);

	/* implement behaviour of Python's bisect_left on the ring (used in
	 * carbon hash source), one day we might want to implement it as
	 * real binary search iso forward pointer chasing */
	for (w = ring->entries, i = 0; w != NULL; i++, w = w->next)
		if (w->pos >= pos)
			break;
	/* now fetch enough unique servers to match the requested count;
	 * wrap around the ring and skip servers already selected */
	for (i = 0; i < replcnt; i++, w = w->next) {
		if (w == NULL)
			w = ring->entries;
		for (j = i - 1; j >= 0; j--) {
			if (ret[j].dest == w->server) {
				j = i;
				break;
			}
		}
		if (j == i) {
			/* duplicate server: redo this slot with the next entry */
			i--;
			continue;
		}
		ret[i].dest = w->server;
		ret[i].metric = strdup(metric);
	}
}

void
ch_printhashring(ch_ring *ring, FILE *f)
{
	ch_ring_entry *w;
	char column = 0;
	char srvbuf[21];

	for (w = ring->entries; w != NULL; w = w->next) {
		snprintf(srvbuf, sizeof(srvbuf), "%s:%d%s%s",
				server_ip(w->server),
				server_port(w->server),
				server_instance(w->server) ?
"=" : "", server_instance(w->server) ? server_instance(w->server) : ""); fprintf(f, "%5d@%-20s", w->pos, srvbuf); if (column < 2) { fprintf(f, " "); column++; } else { fprintf(f, "\n"); column = 0; } } if (column != 0) fprintf(f, "\n"); } unsigned short ch_gethashpos(ch_ring *ring, const char *key, const char *end) { switch (ring->type) { case CARBON: return carbon_hashpos(key, end); case FNV1a: return fnv1a_hashpos(key, end); case JUMP_FNV1a: { unsigned long long int hash; fnv1a_64(hash, key, key, end); return jump_bucketpos(hash, ring->entrycnt); } default: assert(0); /* this shouldn't happen */ } return 0; /* pacify compiler */ } /** * Frees the ring structure and its added nodes. */ void ch_free(ch_ring *ring) { ch_ring_entry *deletes = NULL; ch_ring_entry *w = NULL; for (; ring->entries != NULL; ring->entries = ring->entries->next) { if (ring->entries->malloced) { server_shutdown(ring->entries->server); free(ring->entries->server); if (deletes == NULL) { w = deletes = ring->entries; } else { w = w->next = ring->entries; } } } assert(w != NULL); w->next = NULL; while (deletes != NULL) { w = deletes->next; free(deletes); deletes = w; } if (ring->entrylist != NULL) free(ring->entrylist); free(ring); } carbon-c-relay-1.7/consistent-hash.h000066400000000000000000000022761265266732300174650ustar00rootroot00000000000000/* * Copyright 2013-2016 Fabian Groffen * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #ifndef CONSISTENT_HASH_H #define CONSISTENT_HASH_H 1 #include #include "server.h" #include "router.h" #ifndef CH_RING #define CH_RING void #endif typedef CH_RING ch_ring; typedef enum { CARBON, FNV1a, JUMP_FNV1a } ch_type; ch_ring *ch_new(ch_type type); ch_ring *ch_addnode(ch_ring *ring, server *s); void ch_get_nodes( destination ret[], ch_ring *ring, const char replcnt, const char *metric, const char *firstspace); void ch_printhashring(ch_ring *ring, FILE *out); unsigned short ch_gethashpos(ch_ring *ring, const char *key, const char *end); void ch_free(ch_ring *ring); #endif carbon-c-relay-1.7/contrib/000077500000000000000000000000001265266732300156335ustar00rootroot00000000000000carbon-c-relay-1.7/contrib/relay.logrotate000066400000000000000000000004341265266732300206720ustar00rootroot00000000000000# not installed by default as logrotate is used to manage all carbon log files. /var/log/%%NAME%%/%%NAME%%.log { sharedscripts missingok notifempty rotate 30 compress postrotate [ ! -f /var/run/%%NAME%%/%%NAME%%.pid ] || /etc/init.d/%%NAME%% restart endscript } carbon-c-relay-1.7/contrib/relay.monit000066400000000000000000000004621265266732300200210ustar00rootroot00000000000000# Monit script to ensure carbon c relay is always running check process %%NAME%% with pidfile /var/run/%%NAME%%/%%NAME%%.pid start program = "/etc/init.d/%%NAME%% start" stop program = "/etc/init.d/%%NAME%% stop" if failed port 2003 type tcp then restart if 5 restarts within 5 cycles then timeout carbon-c-relay-1.7/dispatcher.c000066400000000000000000000466341265266732300165020ustar00rootroot00000000000000/* * Copyright 2013-2016 Fabian Groffen * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "relay.h" #include "router.h" #include "server.h" #include "collector.h" #include "dispatcher.h" enum conntype { LISTENER, CONNECTION }; typedef struct _connection { int sock; char takenby; /* -2: being setup, -1: free, 0: not taken, >0: tid */ char srcaddr[24]; /* string representation of source address */ char buf[METRIC_BUFSIZ]; int buflen; char needmore:1; char noexpire:1; char metric[METRIC_BUFSIZ]; destination dests[CONN_DESTS_SIZE]; size_t destlen; struct timeval lastwork; char hadwork:1; char isaggr:1; } connection; struct _dispatcher { pthread_t tid; enum conntype type; char id; size_t metrics; size_t blackholes; size_t ticks; size_t prevmetrics; size_t prevblackholes; size_t prevticks; enum { RUNNING, SLEEPING } state; char keep_running:1; route *routes; route *pending_routes; char route_refresh_pending:1; char hold:1; char *allowed_chars; }; static connection *listeners[32]; /* hopefully enough */ static connection *connections = NULL; static size_t connectionslen = 0; pthread_rwlock_t connectionslock = PTHREAD_RWLOCK_INITIALIZER; static size_t acceptedconnections = 0; static size_t closedconnections = 0; /** * Helper function to try and be helpful to the user. If errno * indicates no new fds could be made, checks what the current max open * files limit is, and if it's close to what we have in use now, write * an informative message to stderr. 
 */
void
dispatch_check_rlimit_and_warn(void)
{
	if (errno == EISCONN || errno == EMFILE) {
		struct rlimit ofiles;
		/* rlimit can be changed for the running process (at least on
		 * Linux 2.6+) so refetch this value every time, should only
		 * occur on errors anyway */
		if (getrlimit(RLIMIT_NOFILE, &ofiles) < 0)
			ofiles.rlim_max = 0;
		if (ofiles.rlim_max != RLIM_INFINITY && ofiles.rlim_max > 0)
			logerr("process configured maximum connections = %d, "
					"consider raising max open files/max descriptor limit\n",
					(int)ofiles.rlim_max);
	}
}

/**
 * Adds an (initial) listener socket to the chain of connections.
 * Listener sockets are those which need to be accept()-ed on.
 * Returns 0 on success, 1 on allocation failure or when all listener
 * slots are occupied.
 */
int
dispatch_addlistener(int sock)
{
	connection *newconn;
	int c;

	newconn = malloc(sizeof(connection));
	if (newconn == NULL)
		return 1;

	(void) fcntl(sock, F_SETFL, O_NONBLOCK);
	newconn->sock = sock;
	newconn->takenby = 0;
	newconn->buflen = 0;
	/* atomically claim the first free slot; listeners is scanned
	 * lock-free by the listener dispatcher thread */
	for (c = 0; c < sizeof(listeners) / sizeof(connection *); c++)
		if (__sync_bool_compare_and_swap(&(listeners[c]), NULL, newconn))
			break;
	if (c == sizeof(listeners) / sizeof(connection *)) {
		free(newconn);
		logerr("cannot add new listener: "
				"no more free listener slots (max = %zu)\n",
				sizeof(listeners) / sizeof(connection *));
		return 1;
	}

	return 0;
}

void
dispatch_removelistener(int sock)
{
	int c;
	connection *conn;

	/* find connection */
	for (c = 0; c < sizeof(listeners) / sizeof(connection *); c++)
		if (listeners[c] != NULL && listeners[c]->sock == sock)
			break;
	if (c == sizeof(listeners) / sizeof(connection *)) {
		/* not found?!? */
		logerr("dispatch: cannot find listener!\n");
		return;
	}
	/* make this connection no longer visible */
	conn = listeners[c];
	listeners[c] = NULL;
	/* if some other thread was looking at conn, make sure it
	 * will have moved on before freeing this object */
	usleep(10 * 1000);  /* 10ms */
	close(conn->sock);
	free(conn);
}

#define CONNGROWSZ  1024

/**
 * Adds a connection socket to the chain of connections.
 * Connection sockets are those which need to be read from.
 * Returns the connection id, or -1 if a failure occurred.
 */
int
dispatch_addconnection(int sock)
{
	size_t c;
	struct sockaddr_in6 saddr;
	socklen_t saddr_len = sizeof(saddr);

	/* try to claim a free slot (takenby == -1) in the existing array;
	 * -2 marks the slot as "being set up" */
	pthread_rwlock_rdlock(&connectionslock);
	for (c = 0; c < connectionslen; c++)
		if (__sync_bool_compare_and_swap(&(connections[c].takenby), -1, -2))
			break;
	pthread_rwlock_unlock(&connectionslock);

	if (c == connectionslen) {
		/* no free slot: grow the connections array under write lock */
		connection *newlst;

		pthread_rwlock_wrlock(&connectionslock);
		if (connectionslen > c) {
			/* another dispatcher just extended the list */
			pthread_rwlock_unlock(&connectionslock);
			return dispatch_addconnection(sock);
		}
		newlst = realloc(connections,
				sizeof(connection) * (connectionslen + CONNGROWSZ));
		if (newlst == NULL) {
			logerr("cannot add new connection: "
					"out of memory allocating more slots (max = %zu)\n",
					connectionslen);
			pthread_rwlock_unlock(&connectionslock);
			return -1;
		}
		memset(&newlst[connectionslen], '\0',
				sizeof(connection) * CONNGROWSZ);
		for (c = connectionslen; c < connectionslen + CONNGROWSZ; c++)
			newlst[c].takenby = -1;  /* free */
		connections = newlst;
		c = connectionslen;  /* for the setup code below */
		newlst[c].takenby = -2;
		connectionslen += CONNGROWSZ;

		pthread_rwlock_unlock(&connectionslock);
	}

	/* figure out who's calling */
	if (getpeername(sock, (struct sockaddr *)&saddr, &saddr_len) == 0) {
		snprintf(connections[c].srcaddr, sizeof(connections[c].srcaddr),
				"(unknown)");
		switch (saddr.sin6_family) {
			case PF_INET:
				inet_ntop(saddr.sin6_family,
						&((struct sockaddr_in *)&saddr)->sin_addr,
						connections[c].srcaddr,
						sizeof(connections[c].srcaddr));
				break;
			case PF_INET6:
				inet_ntop(saddr.sin6_family, &saddr.sin6_addr,
						connections[c].srcaddr,
						sizeof(connections[c].srcaddr));
				break;
		}
	}

	(void) fcntl(sock, F_SETFL, O_NONBLOCK);
	connections[c].sock = sock;
	connections[c].buflen = 0;
	connections[c].needmore = 0;
	connections[c].noexpire = 0;
	connections[c].isaggr = 0;
	connections[c].destlen = 0;
	gettimeofday(&connections[c].lastwork, NULL);
	connections[c].hadwork = 1;  /* force
 first iteration before stalling */
	connections[c].takenby = 0;  /* now dispatchers will pick this one up */
	acceptedconnections++;

	return c;
}

/**
 * Adds a connection which we know is from an aggregator, so direct
 * pipe.  This is different from normal connections that we don't want
 * to count them, never expire them, and want to recognise them when
 * we're doing reloads.
 */
int
dispatch_addconnection_aggr(int sock)
{
	int conn = dispatch_addconnection(sock);

	if (conn == -1)
		return 1;

	connections[conn].noexpire = 1;
	connections[conn].isaggr = 1;
	/* undo the accounting done by dispatch_addconnection() */
	acceptedconnections--;

	return 0;
}

/**
 * Adds a pseudo-listener for datagram (UDP) sockets, which is pseudo,
 * for in fact it adds a new connection, but makes sure that connection
 * won't be closed after being idle, and won't count that connection as
 * an incoming connection either.
 */
int
dispatch_addlistener_udp(int sock)
{
	int conn = dispatch_addconnection(sock);

	if (conn == -1)
		return 1;

	connections[conn].noexpire = 1;
	acceptedconnections--;

	return 0;
}

/* Attempts to hand off any queued destinations for conn to their
 * servers.  Returns 0 when a server refused the metric (queue full)
 * and the remainder was kept for retry, 1 when the queue is empty. */
inline static char
dispatch_process_dests(connection *conn, dispatcher *self, struct timeval now)
{
	int i;
	/* after being blocked for a second, force the metric onto the
	 * server's queue */
	char force = timediff(conn->lastwork, now) > 1 * 1000 * 1000;  /* 1 sec timeout */

	if (conn->destlen > 0) {
		for (i = 0; i < conn->destlen; i++) {
			if (server_send(conn->dests[i].dest,
						conn->dests[i].metric, force) == 0)
				break;
		}
		if (i != conn->destlen) {
			/* not everything was sent; shift the remainder to the
			 * front so a later call can resume where we stopped */
			conn->destlen -= i;
			memmove(&conn->dests[0], &conn->dests[i],
					(sizeof(destination) * conn->destlen));
			return 0;
		} else {
			/* finally "complete" this metric */
			conn->destlen = 0;
			conn->lastwork = now;
			conn->hadwork = 1;
		}
	}

	return 1;
}

#define IDLE_DISCONNECT_TIME  (10 * 60 * 1000 * 1000)  /* 10 minutes */

/**
 * Look at conn and see if work needs to be done.  If so, do it.  This
 * function operates on an (exclusive) lock on the connection it serves.
* Schematically, what this function does is like this: * * read (partial) data <---- * | | * v | * split and clean metrics | * | | * v | * route metrics | feedback loop * | | (stall client) * v | * send 1st attempt | * \ | * v* | * this is optional, but if a server's * retry send (<1s) -------- queue is full, the client is stalled * block reads */ static int dispatch_connection(connection *conn, dispatcher *self) { char *p, *q, *firstspace, *lastnl; int len; struct timeval start, stop; gettimeofday(&start, NULL); /* first try to resume any work being blocked */ if (dispatch_process_dests(conn, self, start) == 0) { gettimeofday(&stop, NULL); self->ticks += timediff(start, stop); conn->takenby = 0; return 0; } gettimeofday(&stop, NULL); self->ticks += timediff(start, stop); /* don't poll (read) when the last time we ran nothing happened, * this is to avoid excessive CPU usage, issue #126 */ if (!conn->hadwork && timediff(conn->lastwork, start) < 100 * 1000) { conn->takenby = 0; return 0; } conn->hadwork = 0; gettimeofday(&start, NULL); len = -2; /* try to read more data, if that succeeds, or we still have data * left in the buffer, try to process the buffer */ if ( (!conn->needmore && conn->buflen > 0) || (len = read(conn->sock, conn->buf + conn->buflen, (sizeof(conn->buf) - 1) - conn->buflen)) > 0 ) { if (len > 0) conn->buflen += len; /* metrics look like this: metric_path value timestamp\n * due to various messups we need to sanitise the * metrics_path here, to ensure we can calculate the metric * name off the filesystem path (and actually retrieve it in * the web interface). */ q = conn->metric; firstspace = NULL; lastnl = NULL; for (p = conn->buf; p - conn->buf < conn->buflen; p++) { if (*p == '\n' || *p == '\r') { /* end of metric */ lastnl = p; /* just a newline on it's own? some random garbage? 
skip */ if (q == conn->metric || firstspace == NULL) { q = conn->metric; firstspace = NULL; continue; } self->metrics++; *q++ = '\n'; *q = '\0'; /* can do this because we substract one from buf */ /* perform routing of this metric */ self->blackholes += router_route( conn->dests, &conn->destlen, CONN_DESTS_SIZE, conn->srcaddr, conn->metric, firstspace, self->routes); /* restart building new one from the start */ q = conn->metric; firstspace = NULL; conn->hadwork = 1; gettimeofday(&conn->lastwork, NULL); /* send the metric to where it is supposed to go */ if (dispatch_process_dests(conn, self, conn->lastwork) == 0) break; } else if (*p == ' ' || *p == '\t' || *p == '.') { /* separator */ if (q == conn->metric) { /* make sure we skip this on next iteration to * avoid an infinite loop, issues #8 and #51 */ lastnl = p; continue; } if (*p == '\t') *p = ' '; if (*p == ' ' && firstspace == NULL) { if (*(q - 1) == '.') q--; /* strip trailing separator */ firstspace = q; *q++ = ' '; } else { /* metric_path separator or space, * - duplicate elimination * - don't start with separator/space */ if (*(q - 1) != *p && (q - 1) != firstspace) *q++ = *p; } } else if (firstspace != NULL || (*p >= 'a' && *p <= 'z') || (*p >= 'A' && *p <= 'Z') || (*p >= '0' && *p <= '9') || strchr(self->allowed_chars, *p)) { /* copy char */ *q++ = *p; } else { /* something barf, replace by underscore */ *q++ = '_'; } } conn->needmore = q != conn->metric; if (lastnl != NULL) { /* move remaining stuff to the front */ conn->buflen -= lastnl + 1 - conn->buf; memmove(conn->buf, lastnl + 1, conn->buflen); } } gettimeofday(&stop, NULL); self->ticks += timediff(start, stop); if (len == -1 && (errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK)) { /* nothing available/no work done */ if (!conn->noexpire && timediff(conn->lastwork, stop) > IDLE_DISCONNECT_TIME) { /* force close connection below */ len = 0; } else { conn->takenby = 0; return 0; } } if (len == -1 || len == 0) { /* error + EOF */ /* we also 
disconnect the client in this case if our reading * buffer is full, but we still need more (read returns 0 if the * size argument is 0) -> this is good, because we can't do much * with such client */ if (conn->noexpire) { /* reset buffer only (UDP) and move on */ conn->needmore = 1; conn->buflen = 0; conn->takenby = 0; return 0; } else { closedconnections++; close(conn->sock); /* flag this connection as no longer in use */ conn->takenby = -1; return 0; } } /* "release" this connection again */ conn->takenby = 0; return 1; } /** * pthread compatible routine that handles connections and processes * whatever comes in on those. */ static void * dispatch_runner(void *arg) { dispatcher *self = (dispatcher *)arg; connection *conn; int work; int c; self->metrics = 0; self->blackholes = 0; self->ticks = 0; self->prevmetrics = 0; self->prevblackholes = 0; self->prevticks = 0; self->state = SLEEPING; if (self->type == LISTENER) { struct pollfd ufds[sizeof(listeners) / sizeof(connection *)]; while (self->keep_running) { for (c = 0; c < sizeof(listeners) / sizeof(connection *); c++) { if (listeners[c] == NULL) break; ufds[c].fd = listeners[c]->sock; ufds[c].events = POLLIN; } if (poll(ufds, c, 1000) > 0) { for (--c; c >= 0; c--) { if (ufds[c].revents & POLLIN) { int client; struct sockaddr addr; socklen_t addrlen = sizeof(addr); if ((client = accept(ufds[c].fd, &addr, &addrlen)) < 0) { logerr("dispatch: failed to " "accept() new connection: %s\n", strerror(errno)); dispatch_check_rlimit_and_warn(); continue; } if (dispatch_addconnection(client) == -1) { close(client); continue; } } } } } } else if (self->type == CONNECTION) { while (self->keep_running) { work = 0; if (self->route_refresh_pending) { self->routes = self->pending_routes; self->pending_routes = NULL; self->route_refresh_pending = 0; self->hold = 0; } pthread_rwlock_rdlock(&connectionslock); for (c = 0; c < connectionslen; c++) { conn = &(connections[c]); /* atomically try to "claim" this connection */ if 
(!__sync_bool_compare_and_swap(&(conn->takenby), 0, self->id)) continue; if (self->hold && !conn->isaggr) { conn->takenby = 0; continue; } self->state = RUNNING; work += dispatch_connection(conn, self); } pthread_rwlock_unlock(&connectionslock); self->state = SLEEPING; /* nothing done, avoid spinlocking */ if (self->keep_running && work == 0) usleep((100 + (rand() % 200)) * 1000); /* 100ms - 300ms */ } } else { logerr("huh? unknown self type!\n"); } return NULL; } /** * Starts a new dispatcher for the given type and with the given id. * Returns its handle. */ static dispatcher * dispatch_new(char id, enum conntype type, route *routes, char *allowed_chars) { dispatcher *ret = malloc(sizeof(dispatcher)); if (ret == NULL) return NULL; ret->id = id; ret->type = type; ret->keep_running = 1; ret->routes = routes; ret->route_refresh_pending = 0; ret->hold = 0; ret->allowed_chars = allowed_chars; if (pthread_create(&ret->tid, NULL, dispatch_runner, ret) != 0) { free(ret); return NULL; } return ret; } static char globalid = 0; /** * Starts a new dispatcher specialised in handling incoming connections * (and putting them on the queue for handling the connections). */ dispatcher * dispatch_new_listener(void) { char id = globalid++; return dispatch_new(id, LISTENER, NULL, NULL); } /** * Starts a new dispatcher specialised in handling incoming data on * existing connections. */ dispatcher * dispatch_new_connection(route *routes, char *allowed_chars) { char id = globalid++; return dispatch_new(id, CONNECTION, routes, allowed_chars); } /** * Signals this dispatcher to stop whatever it's doing. */ void dispatch_stop(dispatcher *d) { d->keep_running = 0; } /** * Shuts down and frees up dispatcher d. Returns when the dispatcher * has terminated. */ void dispatch_shutdown(dispatcher *d) { dispatch_stop(d); pthread_join(d->tid, NULL); free(d); } /** * Requests this dispatcher to stop processing connections. 
As soon as * schedulereload finishes reloading the routes, this dispatcher will * un-hold and continue processing connections. * Returns when the dispatcher is no longer doing work. */ inline void dispatch_hold(dispatcher *d) { d->hold = 1; } /** * Schedules routes r to be put in place for the current routes. The * replacement is performed at the next cycle of the dispatcher. */ inline void dispatch_schedulereload(dispatcher *d, route *r) { d->pending_routes = r; d->route_refresh_pending = 1; } /** * Returns true if the routes scheduled to be reloaded by a call to * dispatch_schedulereload() have been activated. */ inline char dispatch_reloadcomplete(dispatcher *d) { return d->route_refresh_pending == 0; } /** * Returns the wall-clock time in milliseconds consumed by this dispatcher. */ inline size_t dispatch_get_ticks(dispatcher *self) { return self->ticks; } /** * Returns the wall-clock time consumed since last call to this * function. */ inline size_t dispatch_get_ticks_sub(dispatcher *self) { size_t d = self->ticks - self->prevticks; self->prevticks += d; return d; } /** * Returns the number of metrics dispatched since start. */ inline size_t dispatch_get_metrics(dispatcher *self) { return self->metrics; } /** * Returns the number of metrics dispatched since last call to this * function. */ inline size_t dispatch_get_metrics_sub(dispatcher *self) { size_t d = self->metrics - self->prevmetrics; self->prevmetrics += d; return d; } /** * Returns the number of metrics that were explicitly or implicitly * blackholed since start. */ inline size_t dispatch_get_blackholes(dispatcher *self) { return self->blackholes; } /** * Returns the number of metrics that were blackholed since last call to * this function. */ inline size_t dispatch_get_blackholes_sub(dispatcher *self) { size_t d = self->blackholes - self->prevblackholes; self->prevblackholes += d; return d; } /** * Returns whether this dispatcher is currently running, or not. 
A * dispatcher is running when it is actively handling a connection, and * all tasks related to getting the data received in the place where it * should be. */ inline char dispatch_busy(dispatcher *self) { return self->state == RUNNING; } /** * Returns the number of accepted connections thusfar. */ size_t dispatch_get_accepted_connections(void) { return acceptedconnections; } /** * Returns the number of closed connections thusfar. */ size_t dispatch_get_closed_connections(void) { return closedconnections; } carbon-c-relay-1.7/dispatcher.h000066400000000000000000000033041265266732300164720ustar00rootroot00000000000000/* * Copyright 2013-2016 Fabian Groffen * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #ifndef DISPATCHER_H #define DISPATCHER_H 1 #include #include "router.h" typedef struct _dispatcher dispatcher; void dispatch_check_rlimit_and_warn(void); int dispatch_addlistener(int sock); int dispatch_addlistener_udp(int sock); void dispatch_removelistener(int sock); int dispatch_addconnection(int sock); int dispatch_addconnection_aggr(int sock); dispatcher *dispatch_new_listener(void); dispatcher *dispatch_new_connection(route *routes, char *allowed_chars); void dispatch_stop(dispatcher *d); void dispatch_shutdown(dispatcher *d); size_t dispatch_get_ticks(dispatcher *self); size_t dispatch_get_metrics(dispatcher *self); size_t dispatch_get_blackholes(dispatcher *self); size_t dispatch_get_ticks_sub(dispatcher *self); size_t dispatch_get_metrics_sub(dispatcher *self); size_t dispatch_get_blackholes_sub(dispatcher *self); char dispatch_busy(dispatcher *self); size_t dispatch_get_accepted_connections(void); size_t dispatch_get_closed_connections(void); void dispatch_hold(dispatcher *d); void dispatch_schedulereload(dispatcher *d, route *r); char dispatch_reloadcomplete(dispatcher *d); #endif carbon-c-relay-1.7/fnv1a.h000066400000000000000000000025131265266732300153600ustar00rootroot00000000000000/* * Copyright 2013-2016 Fabian Groffen * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #define FNV1A_32_OFFSET 2166136261UL #define FNV1A_32_PRIME 16777619 /** * 32-bits unsigned FNV1a returning into hash, using p to as variable to * walk over metric up to firstspace */ #define fnv1a_32(hash, p, metric, firstspace) \ hash = FNV1A_32_OFFSET; \ for (p = metric; p < firstspace; p++) \ hash = (hash ^ (unsigned int)*p) * FNV1A_32_PRIME; #define FNV1A_64_OFFSET 14695981039346656037ULL #define FNV1A_64_PRIME 1099511628211UL /** * 64-bits unsigned FNV1a returning into hash, using p to as variable to * walk over metric up to firstspace */ #define fnv1a_64(hash, p, metric, firstspace) \ hash = FNV1A_64_OFFSET; \ for (p = metric; p < firstspace; p++) \ hash = (hash ^ (unsigned long long int)*p) * FNV1A_64_PRIME; carbon-c-relay-1.7/issues/000077500000000000000000000000001265266732300155065ustar00rootroot00000000000000carbon-c-relay-1.7/issues/distributiontest.c000066400000000000000000000074201265266732300212740ustar00rootroot00000000000000/* * Copyright 2013-2016 Fabian Groffen * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* This is a clumpsy program to test the distribution of a certain input * set of metrics. See also: * https://github.com/graphite-project/carbon/issues/485 * * compile using something like this: * clang -o distributiontest -I. 
issues/distributiontest.c consistent-hash.c \ * server.c queue.c md5.c dispatcher.c router.c aggregator.c -pthread -lm */ #include #include #include #include #include #include #include "consistent-hash.h" #include "router.h" #include "server.h" #include "relay.h" #include "md5.h" #define SRVCNT 8 #define REPLCNT 2 enum rmode mode = NORMAL; int relaylog(enum logdst dest, const char *fmt, ...) { (void) dest; (void) fmt; return 0; } int main(int argc, char *argv[]) { FILE *f; int i, j; ch_ring *r; char buf[METRIC_BUFSIZ]; server *s[SRVCNT]; size_t scnt[SRVCNT]; size_t metrics; time_t start, stop; size_t min, max; double mean, stddev; if (argc < 2) { fprintf(stderr, "need file argument\n"); return 1; } if (strcmp(argv[1], "-") == 0) { f = stdin; } else { f = fopen(argv[1], "r"); } if (f == NULL) { fprintf(stderr, "failed to open '%s': %s\n", argv[1], strerror(errno)); return 1; } /* build hash-ring */ r = ch_new(JUMP_FNV1a); /* or CARBON, FNV1a */ for (i = 0; i < SRVCNT; i++) { char ip[24]; unsigned char md5[MD5_DIGEST_LENGTH]; char md5sum[MD5_DIGEST_LENGTH * 2 + 1]; snprintf(ip, sizeof(ip), "192.168.%d.%d", i, 10 + (i * 2)); s[i] = server_new( ip, 2003, CON_TCP, NULL, 1024, 128, 800 ); MD5((unsigned char *)ip, strlen(ip), md5); for (j = 0; j < MD5_DIGEST_LENGTH; j++) { snprintf(md5sum + (j * 2), 3, "%02x", md5[j]); } /* to use instance, enable this */ if (1 == 0) server_set_instance(s[i], md5sum); r = ch_addnode(r, s[i]); } /* process input */ memset(scnt, 0, sizeof(size_t) * SRVCNT); metrics = 0; start = time(NULL); while (fgets(buf, METRIC_BUFSIZ, f) != NULL) { destination dst[REPLCNT]; metrics++; ch_get_nodes( dst, r, REPLCNT, buf, buf + strlen(buf) - 1 ); for (i = 0; i < REPLCNT; i++) { free((void *)dst[i].metric); for (j = 0; j < SRVCNT; j++) { if (s[j] == dst[i].dest) { scnt[j]++; break; } } } } stop = time(NULL); mean = 0.0; stddev = 0.0; if (stop == start) { stop++; } printf("total metrics processed: %zd, time spent: ~%ds (~%d/s)\n", metrics, (int)(stop - 
start), (int)(metrics / (stop - start))); printf("replication count: %d, server count: %d\n", REPLCNT, SRVCNT); printf("server distribution:\n"); for (i = 0; i < SRVCNT; i++) { printf("- server %15s: %6zd (%.2f%%)\n", server_ip(s[i]), scnt[i], ((double)scnt[i] * 100.0) / (double)metrics); if (i == 0) { min = max = scnt[i]; } else { if (scnt[i] < min) min = scnt[i]; if (scnt[i] > max) max = scnt[i]; } mean += (double)scnt[i]; } mean /= (double)SRVCNT; for (i = 0; i < SRVCNT; i++) { stddev += pow((double)scnt[i] - mean, 2); } stddev = sqrt(stddev / (double)SRVCNT); printf("band: %zd - %zd (diff %zd, %.1f%%), mean: %.2f, stddev: %.2f\n", min, max, max - min, (double)(max-min) * 100.0 / (mean * SRVCNT), mean, stddev); } carbon-c-relay-1.7/issues/issue10.conf000066400000000000000000000020531265266732300176460ustar00rootroot00000000000000rewrite ^servers\.(cloud|bc|cwwtf|telhc)\.(int|test|stage|live|eng|green|mgmt)\.([a-zA-Z]+)([0-9]+) into servers.\1.\2.\3.\3\4 ; #remove derive, objects and gauges, etc. in keys (unless for applications (statsd)) # regex implementation doesn't understand this one #rewrite ^(.*)\.(? 1: arg = sys.argv[1] if arg.isdigit(): delay = int(arg) else: sys.stderr.write("Ignoring non-integer argument. Using default: %ss\n" % delay) sock = socket.socket() try: sock.connect( (CARBON_SERVER, CARBON_PORT) ) except socket.error: raise SystemExit("Couldn't connect to %(server)s on port %(port)d, is carbon-cache.py running?" 
% { 'server':CARBON_SERVER, 'port':CARBON_PORT }) try: run(sock, delay) except KeyboardInterrupt: sys.stderr.write("\nExiting on CTRL-c\n") sys.exit(0) if __name__ == "__main__": main() carbon-c-relay-1.7/issues/issue60.conf000066400000000000000000000007521265266732300176570ustar00rootroot00000000000000match ^metrics.all.* send to blackhole stop ; aggregate metrics.*.api.ac\.([^.]+)\.([^.]+)\.([^.]+)\.([^.]+)\.([^.]+)\.count every 10 seconds expire after 50 seconds compute sum write to metrics.all.api.ac.\1.\2.\3.\4.\5.count compute sum write to metrics.all.api.ac.\1.\2.all.\4.\5.count compute sum write to metrics.all.api.ac.\1.\2.\3.all.\5.count compute sum write to metrics.all.api.ac.\1.\2.all.all.\5.count ; carbon-c-relay-1.7/issues/issue82.conf000066400000000000000000000007751265266732300176700ustar00rootroot00000000000000cluster local_carbon carbon_ch 127.0.0.1:2013=a ; match * send to local_carbon stop ; aggregate ^sys\.dc[0-9].(somehost-[0-9]+)\.([^.]+)\.mysql\.replication_delay every 10 seconds expire after 35 seconds compute sum write to mysql.host.\1.replication_delay compute sum write to mysql.host.all.replication_delay compute sum write to mysql.cluster.\2.replication_delay compute sum write to mysql.cluster.all.replication_delay ; carbon-c-relay-1.7/issues/issue94-2.conf000066400000000000000000000004331265266732300200210ustar00rootroot00000000000000cluster graphite forward 127.0.0.1:2004 ; aggregate ^sys\.somemetric every 60 seconds expire after 75 seconds timestamp at end of bucket compute sum write to sys.somemetric send to graphite stop ; match * send to graphite ; carbon-c-relay-1.7/issues/issue94-3.conf000066400000000000000000000025301265266732300200220ustar00rootroot00000000000000############################################################################### # # Define clusters that we will manage. 
# ############################################################################### cluster local forward 127.0.0.1:2004 ; ############################################################################### # # Define any rewrite rules you might want. # ############################################################################### # carbon-c-relay metrics should go under system. rewrite ^carbon\.(.*) into system.carbon.\1 ; ############################################################################### # # Define Aggregations we want to use. # ############################################################################### #access-logs page counts aggregate ^app\.([^.]+)\.dc\.([^.]+)\.env\.([^.]+)\.store\.([^.]+)\.pages\.([^.]+)\.count every 60 seconds expire after 90 seconds compute sum write to aggregated_app.\1.dc.\2.env.\3.store.\4.pages.\5.count send to local ; ############################################################################### # # Send carbon metrics home. If you want another metric to be sent do it here. 
# ############################################################################### match ^system\.carbon\.(.*) send to local stop ; carbon-c-relay-1.7/issues/issue94-4.conf000066400000000000000000000004641265266732300200270ustar00rootroot00000000000000cluster reagg fnv1a_ch 127.0.0.1:2004=a 127.0.0.2:2004=b 127.0.0.3:2004=c 127.0.0.4:2004=d ; cluster b forward 127.0.0.1:2005; aggregate ^(.+).([^.]+).([^.]+)$ every 10 seconds expire after 30 seconds timestamp at end of bucket compute sum write to agg.\1.\2.sumall.hosts send to reagg b stop ; carbon-c-relay-1.7/issues/issue94.conf000066400000000000000000000011271265266732300176630ustar00rootroot00000000000000cluster aggregator-post failover 127.0.0.1:2004 127.0.0.1:2005 ; rewrite ^carbon\.(.*) into system.carbon.\1 ; aggregate ^app\.([^.]+)\.dc\.([^.]+)\.env\.([^.]+)\.store\.([^.]+)\.pages\.([^.]+)\.count every 60 seconds expire after 95 seconds timestamp at end of bucket compute sum write to aggregated_app.\1.dc.\2.env.\3.store.\4.pages.\5.count ; match ^system\.carbon\.(.*) send to aggregator-post stop ; match ^aggregated_(.*) send to aggregator-post stop ; match * send to blackhole stop ; carbon-c-relay-1.7/md5.c000066400000000000000000000211111265266732300150200ustar00rootroot00000000000000/* * This is an OpenSSL-compatible implementation of the RSA Data Security, Inc. * MD5 Message-Digest Algorithm (RFC 1321). * * Homepage: * http://openwall.info/wiki/people/solar/software/public-domain-source-code/md5 * * Author: * Alexander Peslyak, better known as Solar Designer * * This software was written by Alexander Peslyak in 2001. No copyright is * claimed, and the software is hereby placed in the public domain. 
* In case this attempt to disclaim copyright and place the software in the * public domain is deemed null and void, then the software is * Copyright (c) 2001 Alexander Peslyak and it is hereby released to the * general public under the following terms: * * Redistribution and use in source and binary forms, with or without * modification, are permitted. * * There's ABSOLUTELY NO WARRANTY, express or implied. * * (This is a heavily cut-down "BSD license".) * * This differs from Colin Plumb's older public domain implementation in that * no exactly 32-bit integer data type is required (any 32-bit or wider * unsigned integer data type will do), there's no compile-time endianness * configuration, and the function prototypes match OpenSSL's. No code from * Colin Plumb's implementation has been reused; this comment merely compares * the properties of the two independent implementations. * * The primary goals of this implementation are portability and ease of use. * It is meant to be fast, but not as fast as possible. Some known * optimizations are not included to reduce source code size and avoid * compile-time configuration. */ #ifndef HAVE_OPENSSL #include #include "md5.h" /* * The basic MD5 functions. * * F and G are optimized compared to their RFC 1321 definitions for * architectures that lack an AND-NOT instruction, just like in Colin Plumb's * implementation. */ #define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) #define G(x, y, z) ((y) ^ ((z) & ((x) ^ (y)))) #define H(x, y, z) (((x) ^ (y)) ^ (z)) #define H2(x, y, z) ((x) ^ ((y) ^ (z))) #define I(x, y, z) ((y) ^ ((x) | ~(z))) /* * The MD5 transformation for all four rounds. */ #define STEP(f, a, b, c, d, x, t, s) \ (a) += f((b), (c), (d)) + (x) + (t); \ (a) = (((a) << (s)) | (((a) & 0xffffffff) >> (32 - (s)))); \ (a) += (b); /* * SET reads 4 input bytes in little-endian byte order and stores them * in a properly aligned word in host byte order. 
* * The check for little-endian architectures that tolerate unaligned * memory accesses is just an optimization. Nothing will break if it * doesn't work. */ #if defined(__i386__) || defined(__x86_64__) || defined(__vax__) #define SET(n) \ (*(MD5_u32plus *)&ptr[(n) * 4]) #define GET(n) \ SET(n) #else #define SET(n) \ (ctx->block[(n)] = \ (MD5_u32plus)ptr[(n) * 4] | \ ((MD5_u32plus)ptr[(n) * 4 + 1] << 8) | \ ((MD5_u32plus)ptr[(n) * 4 + 2] << 16) | \ ((MD5_u32plus)ptr[(n) * 4 + 3] << 24)) #define GET(n) \ (ctx->block[(n)]) #endif /* * This processes one or more 64-byte data blocks, but does NOT update * the bit counters. There are no alignment requirements. */ static const void *body(MD5_CTX *ctx, const void *data, unsigned long size) { const unsigned char *ptr; MD5_u32plus a, b, c, d; MD5_u32plus saved_a, saved_b, saved_c, saved_d; ptr = (const unsigned char *)data; a = ctx->a; b = ctx->b; c = ctx->c; d = ctx->d; do { saved_a = a; saved_b = b; saved_c = c; saved_d = d; /* Round 1 */ STEP(F, a, b, c, d, SET(0), 0xd76aa478, 7) STEP(F, d, a, b, c, SET(1), 0xe8c7b756, 12) STEP(F, c, d, a, b, SET(2), 0x242070db, 17) STEP(F, b, c, d, a, SET(3), 0xc1bdceee, 22) STEP(F, a, b, c, d, SET(4), 0xf57c0faf, 7) STEP(F, d, a, b, c, SET(5), 0x4787c62a, 12) STEP(F, c, d, a, b, SET(6), 0xa8304613, 17) STEP(F, b, c, d, a, SET(7), 0xfd469501, 22) STEP(F, a, b, c, d, SET(8), 0x698098d8, 7) STEP(F, d, a, b, c, SET(9), 0x8b44f7af, 12) STEP(F, c, d, a, b, SET(10), 0xffff5bb1, 17) STEP(F, b, c, d, a, SET(11), 0x895cd7be, 22) STEP(F, a, b, c, d, SET(12), 0x6b901122, 7) STEP(F, d, a, b, c, SET(13), 0xfd987193, 12) STEP(F, c, d, a, b, SET(14), 0xa679438e, 17) STEP(F, b, c, d, a, SET(15), 0x49b40821, 22) /* Round 2 */ STEP(G, a, b, c, d, GET(1), 0xf61e2562, 5) STEP(G, d, a, b, c, GET(6), 0xc040b340, 9) STEP(G, c, d, a, b, GET(11), 0x265e5a51, 14) STEP(G, b, c, d, a, GET(0), 0xe9b6c7aa, 20) STEP(G, a, b, c, d, GET(5), 0xd62f105d, 5) STEP(G, d, a, b, c, GET(10), 0x02441453, 9) STEP(G, c, d, a, b, 
GET(15), 0xd8a1e681, 14) STEP(G, b, c, d, a, GET(4), 0xe7d3fbc8, 20) STEP(G, a, b, c, d, GET(9), 0x21e1cde6, 5) STEP(G, d, a, b, c, GET(14), 0xc33707d6, 9) STEP(G, c, d, a, b, GET(3), 0xf4d50d87, 14) STEP(G, b, c, d, a, GET(8), 0x455a14ed, 20) STEP(G, a, b, c, d, GET(13), 0xa9e3e905, 5) STEP(G, d, a, b, c, GET(2), 0xfcefa3f8, 9) STEP(G, c, d, a, b, GET(7), 0x676f02d9, 14) STEP(G, b, c, d, a, GET(12), 0x8d2a4c8a, 20) /* Round 3 */ STEP(H, a, b, c, d, GET(5), 0xfffa3942, 4) STEP(H2, d, a, b, c, GET(8), 0x8771f681, 11) STEP(H, c, d, a, b, GET(11), 0x6d9d6122, 16) STEP(H2, b, c, d, a, GET(14), 0xfde5380c, 23) STEP(H, a, b, c, d, GET(1), 0xa4beea44, 4) STEP(H2, d, a, b, c, GET(4), 0x4bdecfa9, 11) STEP(H, c, d, a, b, GET(7), 0xf6bb4b60, 16) STEP(H2, b, c, d, a, GET(10), 0xbebfbc70, 23) STEP(H, a, b, c, d, GET(13), 0x289b7ec6, 4) STEP(H2, d, a, b, c, GET(0), 0xeaa127fa, 11) STEP(H, c, d, a, b, GET(3), 0xd4ef3085, 16) STEP(H2, b, c, d, a, GET(6), 0x04881d05, 23) STEP(H, a, b, c, d, GET(9), 0xd9d4d039, 4) STEP(H2, d, a, b, c, GET(12), 0xe6db99e5, 11) STEP(H, c, d, a, b, GET(15), 0x1fa27cf8, 16) STEP(H2, b, c, d, a, GET(2), 0xc4ac5665, 23) /* Round 4 */ STEP(I, a, b, c, d, GET(0), 0xf4292244, 6) STEP(I, d, a, b, c, GET(7), 0x432aff97, 10) STEP(I, c, d, a, b, GET(14), 0xab9423a7, 15) STEP(I, b, c, d, a, GET(5), 0xfc93a039, 21) STEP(I, a, b, c, d, GET(12), 0x655b59c3, 6) STEP(I, d, a, b, c, GET(3), 0x8f0ccc92, 10) STEP(I, c, d, a, b, GET(10), 0xffeff47d, 15) STEP(I, b, c, d, a, GET(1), 0x85845dd1, 21) STEP(I, a, b, c, d, GET(8), 0x6fa87e4f, 6) STEP(I, d, a, b, c, GET(15), 0xfe2ce6e0, 10) STEP(I, c, d, a, b, GET(6), 0xa3014314, 15) STEP(I, b, c, d, a, GET(13), 0x4e0811a1, 21) STEP(I, a, b, c, d, GET(4), 0xf7537e82, 6) STEP(I, d, a, b, c, GET(11), 0xbd3af235, 10) STEP(I, c, d, a, b, GET(2), 0x2ad7d2bb, 15) STEP(I, b, c, d, a, GET(9), 0xeb86d391, 21) a += saved_a; b += saved_b; c += saved_c; d += saved_d; ptr += 64; } while (size -= 64); ctx->a = a; ctx->b = b; ctx->c = c; ctx->d 
= d; return ptr; } void MD5_Init(MD5_CTX *ctx) { ctx->a = 0x67452301; ctx->b = 0xefcdab89; ctx->c = 0x98badcfe; ctx->d = 0x10325476; ctx->lo = 0; ctx->hi = 0; } void MD5_Update(MD5_CTX *ctx, const void *data, unsigned long size) { MD5_u32plus saved_lo; unsigned long used, available; saved_lo = ctx->lo; if ((ctx->lo = (saved_lo + size) & 0x1fffffff) < saved_lo) ctx->hi++; ctx->hi += size >> 29; used = saved_lo & 0x3f; if (used) { available = 64 - used; if (size < available) { memcpy(&ctx->buffer[used], data, size); return; } memcpy(&ctx->buffer[used], data, available); data = (const unsigned char *)data + available; size -= available; body(ctx, ctx->buffer, 64); } if (size >= 64) { data = body(ctx, data, size & ~(unsigned long)0x3f); size &= 0x3f; } memcpy(ctx->buffer, data, size); } void MD5_Final(unsigned char *result, MD5_CTX *ctx) { unsigned long used, available; used = ctx->lo & 0x3f; ctx->buffer[used++] = 0x80; available = 64 - used; if (available < 8) { memset(&ctx->buffer[used], 0, available); body(ctx, ctx->buffer, 64); used = 0; available = 64; } memset(&ctx->buffer[used], 0, available - 8); ctx->lo <<= 3; ctx->buffer[56] = ctx->lo; ctx->buffer[57] = ctx->lo >> 8; ctx->buffer[58] = ctx->lo >> 16; ctx->buffer[59] = ctx->lo >> 24; ctx->buffer[60] = ctx->hi; ctx->buffer[61] = ctx->hi >> 8; ctx->buffer[62] = ctx->hi >> 16; ctx->buffer[63] = ctx->hi >> 24; body(ctx, ctx->buffer, 64); result[0] = ctx->a; result[1] = ctx->a >> 8; result[2] = ctx->a >> 16; result[3] = ctx->a >> 24; result[4] = ctx->b; result[5] = ctx->b >> 8; result[6] = ctx->b >> 16; result[7] = ctx->b >> 24; result[8] = ctx->c; result[9] = ctx->c >> 8; result[10] = ctx->c >> 16; result[11] = ctx->c >> 24; result[12] = ctx->d; result[13] = ctx->d >> 8; result[14] = ctx->d >> 16; result[15] = ctx->d >> 24; memset(ctx, 0, sizeof(*ctx)); } unsigned char *MD5(const void *data, unsigned long size, unsigned char *result) { MD5_CTX ctx; MD5_Init(&ctx); MD5_Update(&ctx, data, size); MD5_Final(result, 
&ctx); return result; } #endif carbon-c-relay-1.7/md5.h000066400000000000000000000027701265266732300150370ustar00rootroot00000000000000/* * This is an OpenSSL-compatible implementation of the RSA Data Security, Inc. * MD5 Message-Digest Algorithm (RFC 1321). * * Homepage: * http://openwall.info/wiki/people/solar/software/public-domain-source-code/md5 * * Author: * Alexander Peslyak, better known as Solar Designer * * This software was written by Alexander Peslyak in 2001. No copyright is * claimed, and the software is hereby placed in the public domain. * In case this attempt to disclaim copyright and place the software in the * public domain is deemed null and void, then the software is * Copyright (c) 2001 Alexander Peslyak and it is hereby released to the * general public under the following terms: * * Redistribution and use in source and binary forms, with or without * modification, are permitted. * * There's ABSOLUTELY NO WARRANTY, express or implied. * * See md5.c for more information. */ #ifdef HAVE_OPENSSL #include #elif !defined(_MD5_H) #define _MD5_H #define MD5_DIGEST_LENGTH 16 /* Any 32-bit or wider unsigned integer data type will do */ typedef unsigned int MD5_u32plus; typedef struct { MD5_u32plus lo, hi; MD5_u32plus a, b, c, d; unsigned char buffer[64]; MD5_u32plus block[16]; } MD5_CTX; extern void MD5_Init(MD5_CTX *ctx); extern void MD5_Update(MD5_CTX *ctx, const void *data, unsigned long size); extern void MD5_Final(unsigned char *result, MD5_CTX *ctx); extern unsigned char *MD5(const void *data, unsigned long size, unsigned char *result); #endif carbon-c-relay-1.7/queue.c000066400000000000000000000113161265266732300154650ustar00rootroot00000000000000/* * Copyright 2013-2016 Fabian Groffen * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include #include #include "queue.h" struct _queue { const char **queue; size_t end; size_t write; size_t read; size_t len; pthread_mutex_t lock; }; /** * Allocates a new queue structure with capacity to hold size elements. */ queue * queue_new(size_t size) { queue *ret = malloc(sizeof(queue)); if (ret == NULL) return NULL; ret->queue = malloc(sizeof(char *) * size); if (ret->queue == NULL) { free(ret); return NULL; } memset(ret->queue, 0, sizeof(char *) * size); ret->end = size; ret->read = ret->write = 0; ret->len = 0; pthread_mutex_init(&ret->lock, NULL); return ret; } /** * Frees up allocated resources in use by the queue. This doesn't take * into account any consumers at all. That is, the caller needs to * ensure noone is using the queue any more. */ void queue_destroy(queue *q) { q->len = 0; pthread_mutex_destroy(&q->lock); free(q->queue); free(q); } /** * Enqueues the string pointed to by p at queue q. If the queue is * full, the oldest entry is dropped. For this reason, enqueuing will * never fail. This function assumes the pointer p is a private copy * for this queue, and hence will be freed once processed. 
*/ void queue_enqueue(queue *q, const char *p) { /* queue normal: * |=====-----------------------------| 4 * ^ ^ * r w * queue wrap: * |===---------------------------====| 6 * ^ ^ * w r * queue full * |==================================| 23 * ^ * w+r */ pthread_mutex_lock(&q->lock); if (q->len == q->end) { if (q->read == q->end) q->read = 0; free((char *)(q->queue[q->read])); q->read++; q->len--; } if (q->write == q->end) q->write = 0; q->queue[q->write] = p; q->write++; q->len++; pthread_mutex_unlock(&q->lock); } /** * Returns the oldest entry in the queue. If there are no entries, NULL * is returned. The caller should free the returned string. */ const char * queue_dequeue(queue *q) { const char *ret; pthread_mutex_lock(&q->lock); if (q->len == 0) { pthread_mutex_unlock(&q->lock); return NULL; } if (q->read == q->end) q->read = 0; ret = q->queue[q->read++]; q->len--; pthread_mutex_unlock(&q->lock); return ret; } /** * Returns at most len elements from the queue. Attempts to use a * single lock to read a vector of elements from the queue to minimise * effects of locking. Returns the number of elements stored in ret. * The caller is responsible for freeing elements from ret, as well as * making sure it is large enough to store len elements. */ size_t queue_dequeue_vector(const char **ret, queue *q, size_t len) { size_t i; pthread_mutex_lock(&q->lock); if (q->len == 0) { pthread_mutex_unlock(&q->lock); return 0; } if (len > q->len) len = q->len; for (i = 0; i < len; i++) { if (q->read == q->end) q->read = 0; ret[i] = q->queue[q->read++]; } q->len -= len; pthread_mutex_unlock(&q->lock); return len; } /** * Puts the entry p at the front of the queue, instead of the end, if * there is space available in the queue. Returns 0 when no space is * available, non-zero otherwise. Like queue_enqueue, * queue_putback assumes pointer p points to a private copy for the * queue. 
*/ char queue_putback(queue *q, const char *p) { pthread_mutex_lock(&q->lock); if (q->len == q->end) { pthread_mutex_unlock(&q->lock); return 0; } if (q->read == 0) q->read = q->end; q->read--; q->queue[q->read] = p; q->len++; pthread_mutex_unlock(&q->lock); return 1; } /** * Returns the (approximate) size of entries waiting to be read in the * queue. The returned value cannot be taken accurate with multiple * readers/writers concurrently in action. Hence it can only be seen as * mere hint about the state of the queue. */ inline size_t queue_len(queue *q) { return q->len; } /** * Returns the (approximate) size of free entries in the queue. The * same conditions as for queue_len apply. */ inline size_t queue_free(queue *q) { return q->end - q->len; } /** * Returns the size of the queue. */ inline size_t queue_size(queue *q) { return q->end; } carbon-c-relay-1.7/queue.h000066400000000000000000000020221265266732300154640ustar00rootroot00000000000000/* * Copyright 2013-2016 Fabian Groffen * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #ifndef QUEUE_H #define QUEUE_H 1 #include typedef struct _queue queue; queue* queue_new(size_t size); void queue_destroy(queue *q); void queue_enqueue(queue *q, const char *p); const char *queue_dequeue(queue *q); size_t queue_dequeue_vector(const char **ret, queue *q, size_t len); char queue_putback(queue *q, const char *p); size_t queue_len(queue *q); size_t queue_free(queue *q); size_t queue_size(queue *q); #endif carbon-c-relay-1.7/receptor.c000066400000000000000000000115071265266732300161660ustar00rootroot00000000000000/* * Copyright 2013-2016 Fabian Groffen * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include #include #include #include #include #include #include #include #include #include #include "relay.h" #ifndef TMPDIR # define TMPDIR "/tmp" #endif #define SOCKFILE ".s.carbon-c-relay" /** * Opens up listener sockets. Returns the socket fds in ret, and * updates retlen. If opening sockets failed, -1 is returned. The * caller should ensure retlen is at least 1, and ret should be an array * large enough to hold it. 
*/ int bindlisten( int ret_stream[], int *retlen_stream, int ret_dgram[], int *retlen_dgram, const char *interface, unsigned short port, unsigned int backlog) { int sock; int optval; struct timeval tv; struct addrinfo hint; struct addrinfo *res, *resw; char buf[128]; char saddr[INET6_ADDRSTRLEN]; int err; int curlen_stream = 0; int curlen_dgram = 0; int socktypes[] = {SOCK_STREAM, SOCK_DGRAM, 0}; int *socktype = socktypes; tv.tv_sec = 0; tv.tv_usec = 500 * 1000; for (; *socktype != 0; socktype++) { memset(&hint, 0, sizeof(hint)); hint.ai_family = PF_UNSPEC; hint.ai_socktype = *socktype; hint.ai_protocol = 0; hint.ai_flags = AI_ADDRCONFIG | AI_NUMERICSERV | AI_PASSIVE; snprintf(buf, sizeof(buf), "%u", port); if ((err = getaddrinfo(interface, buf, &hint, &res)) != 0) { logerr("getaddrinfo(%s, %s, ...) failed: %s\n", interface == NULL ? "NULL" : interface, buf, gai_strerror(err)); return -1; } for (resw = res; resw != NULL; resw = resw->ai_next) { if (resw->ai_family != PF_INET && resw->ai_family != PF_INET6) continue; if (resw->ai_protocol != IPPROTO_TCP && resw->ai_protocol != IPPROTO_UDP) continue; if ((sock = socket(resw->ai_family, resw->ai_socktype, resw->ai_protocol)) < 0) continue; (void) setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)); optval = 1; /* allow takeover */ (void) setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &optval, sizeof(optval)); if (resw->ai_family == PF_INET6) { optval = 1; (void) setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY, &optval, sizeof(optval)); } if (bind(sock, resw->ai_addr, resw->ai_addrlen) < 0) { close(sock); continue; } snprintf(saddr, sizeof(saddr), "(unknown)"); switch (resw->ai_family) { case PF_INET: inet_ntop(resw->ai_family, &((struct sockaddr_in *)resw->ai_addr)->sin_addr, saddr, sizeof(saddr)); break; case PF_INET6: inet_ntop(resw->ai_family, &((struct sockaddr_in6 *)resw->ai_addr)->sin6_addr, saddr, sizeof(saddr)); break; } if (resw->ai_protocol == IPPROTO_TCP) { if (listen(sock, backlog) < 0) { close(sock); 
continue; } if (curlen_stream < *retlen_stream) { logout("listening on tcp%d %s port %s\n", resw->ai_family == PF_INET6 ? 6 : 4, saddr, buf); ret_stream[curlen_stream++] = sock; } } else { if (curlen_dgram < *retlen_dgram) { logout("listening on udp%d %s port %s\n", resw->ai_family == PF_INET6 ? 6 : 4, saddr, buf); ret_dgram[curlen_dgram++] = sock; } } } freeaddrinfo(res); } if (curlen_stream + curlen_dgram == 0) return -1; /* fake loop to simplify breakout below */ while (curlen_stream < *retlen_stream) { struct sockaddr_un server; #ifndef PF_LOCAL # define PF_LOCAL PF_UNIX #endif if ((sock = socket(PF_LOCAL, SOCK_STREAM, 0)) < 0) break; snprintf(buf, sizeof(buf), "%s/%s.%u", TMPDIR, SOCKFILE, port); memset(&server, 0, sizeof(struct sockaddr_un)); server.sun_family = PF_LOCAL; strncpy(server.sun_path, buf, sizeof(server.sun_path) - 1); unlink(buf); /* avoid address already in use */ if (bind(sock, (struct sockaddr *)&server, sizeof(struct sockaddr_un)) < 0) { logerr("failed to bind for %s: %s\n", buf, strerror(errno)); close(sock); break; } if (listen(sock, backlog) < 0) { close(sock); break; } logout("listening on UNIX socket %s\n", buf); ret_stream[curlen_stream++] = sock; break; } *retlen_stream = curlen_stream; *retlen_dgram = curlen_dgram; return 0; } void destroy_usock(unsigned short port) { char buf[512]; snprintf(buf, sizeof(buf), "%s/%s.%u", TMPDIR, SOCKFILE, port); unlink(buf); } carbon-c-relay-1.7/receptor.h000066400000000000000000000015241265266732300161710ustar00rootroot00000000000000/* * Copyright 2013-2016 Fabian Groffen * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef RECEPTOR_H #define RECEPTOR_H 1 int bindlisten(int ret_stream[], int *retlen_stream, int ret_dgram[], int *retlen_dgram, const char *interface, unsigned short port, unsigned int backlog); void destroy_usock(unsigned short port); #endif carbon-c-relay-1.7/relay.c000066400000000000000000000426241265266732300154630ustar00rootroot00000000000000/* * Copyright 2013-2016 Fabian Groffen * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #include #include #include #include #include #include #include #include #include #if defined(ENABLE_OPENMP) # include #endif #include "relay.h" #include "server.h" #include "router.h" #include "receptor.h" #include "dispatcher.h" #include "aggregator.h" #include "collector.h" int keep_running = 1; char relay_hostname[256]; enum rmode mode = NORMAL; static char *config = NULL; static int batchsize = 2500; static int queuesize = 25000; static unsigned short iotimeout = 600; static dispatcher **workers = NULL; static char workercnt = 0; static cluster *clusters = NULL; static route *routes = NULL; static aggregator *aggrs = NULL; static server *internal_submission = NULL; static char *relay_logfile = NULL; static FILE *relay_stdout = NULL; static FILE *relay_stderr = NULL; static char relay_can_log = 0; /** * Writes to the setup output stream, prefixed with a timestamp, and if * the stream is not going to stdout or stderr, prefixed with MSG or * ERR. */ int relaylog(enum logdst dest, const char *fmt, ...) { va_list ap; char prefix[64]; size_t len; time_t now; struct tm *tm_now; FILE *dst = NULL; char console = 0; int ret; switch (dest) { case LOGOUT: dst = relay_stdout; if (dst == stdout) console = 1; break; case LOGERR: dst = relay_stderr; if (dst == stderr) console = 1; break; } assert(dst != NULL); /* briefly stall if we're swapping fds */ while (!relay_can_log) usleep((100 + (rand() % 200)) * 1000); /* 100ms - 300ms */ time(&now); tm_now = localtime(&now); len = strftime(prefix, sizeof(prefix), "[%Y-%m-%d %H:%M:%S]", tm_now); if (!console) (void)snprintf(prefix + len, sizeof(prefix) - len, " (%s)", dest == LOGOUT ? 
"MSG" : "ERR"); fprintf(dst, "%s ", prefix); va_start(ap, fmt); ret = vfprintf(dst, fmt, ap); fflush(dst); va_end(ap); return ret; } static void exit_handler(int sig) { char *signal = "unknown signal"; switch (sig) { case SIGTERM: signal = "SIGTERM"; break; case SIGINT: signal = "SIGINT"; break; case SIGQUIT: signal = "SIGQUIT"; break; } if (keep_running) { logout("caught %s\n", signal); } else { logerr("caught %s while already shutting down, " "forcing exit!\n", signal); exit(1); } keep_running = 0; } static void hup_handler(int sig) { route *newroutes; cluster *newclusters; aggregator *newaggrs; int id; FILE *newfd; size_t numaggregators; server **servers; size_t i; logout("caught SIGHUP...\n"); if (relay_stderr != stderr) { /* try to re-open the file first, so we can still try and say * something if that fails */ if ((newfd = fopen(relay_logfile, "a")) == NULL) { logerr("not reopening logfiles: can't open '%s': %s\n", relay_logfile, strerror(errno)); } else { logout("closing logfile\n"); relay_can_log = 0; fclose(relay_stderr); relay_stdout = newfd; relay_stderr = newfd; relay_can_log = 1; logout("reopening logfile\n"); } } logout("reloading config from '%s'\n", config); if (router_readconfig(&newclusters, &newroutes, &newaggrs, config, queuesize, batchsize, iotimeout) == 0) { logerr("failed to read configuration '%s', aborting reload\n", config); return; } router_optimise(&newroutes); logout("reloading collector\n"); collector_schedulereload(newclusters, newaggrs); while (!collector_reloadcomplete()) usleep((100 + (rand() % 200)) * 1000); /* 100ms - 300ms */ /* During aggregator final expiry, the dispatchers should not put * new metrics, so we have to temporarily stop them processing * connections. However, we still need to handle aggregations that * we will stop next, so this call results in the dispatchers only * dealing with connections from aggregators. 
Doing this, also * means that we can reload the config atomicly, which disrupts the * service seemingly for a bit, but results in less confusing output * in the end. */ logout("interrupting workers\n"); for (id = 1; id < 1 + workercnt; id++) dispatch_hold(workers[id]); numaggregators = aggregator_numaggregators(aggrs); if (numaggregators > 0) { logout("expiring aggregations\n"); aggregator_stop(); /* frees aggrs */ /* Now the aggregator has written everything to the * dispatchers, which will hand over to the servers with proper * metric adjustments (stubs). */ } numaggregators = aggregator_numaggregators(newaggrs); if (numaggregators > 0) { if (!aggregator_start(newaggrs)) { logerr("failed to start aggregator, aggregations will no " "longer produce output!\n"); } } logout("reloading workers\n"); for (id = 1; id < 1 + workercnt; id++) dispatch_schedulereload(workers[id], newroutes); /* un-holds */ for (id = 1; id < 1 + workercnt; id++) { while (!dispatch_reloadcomplete(workers[id + 0])) usleep((100 + (rand() % 200)) * 1000); /* 100ms - 300ms */ } servers = router_getservers(clusters); for (i = 0; servers[i] != NULL; i++) server_stop(servers[i]); free(servers); router_free(clusters, routes); routes = newroutes; clusters = newclusters; aggrs = newaggrs; logout("SIGHUP handler complete\n"); } static int get_cores(void) { #if defined(ENABLE_OPENMP) return omp_get_num_procs(); #else return 5; #endif } static void do_version(void) { printf("carbon-c-relay v" VERSION " (" GIT_VERSION ")\n"); exit(0); } static void do_usage(int exitcode) { printf("Usage: relay [-vdst] -f [-p ] [-w ] [-b ] [-q ]\n"); printf("\n"); printf("Options:\n"); printf(" -v print version and exit\n"); printf(" -f read for clusters and routes\n"); printf(" -p listen on for connections, defaults to 2003\n"); printf(" -i listen on for connections, defaults to all\n"); printf(" -l write output to , defaults to stdout/stderr\n"); printf(" -w use worker threads, defaults to %d\n", get_cores()); printf(" -b 
server send batch size, defaults to 2500\n"); printf(" -q server queue size, defaults to 25000\n"); printf(" -S statistics sending interval in seconds, defaults to 60\n"); printf(" -B connection listen backlog, defaults to 3\n"); printf(" -T IO timeout in milliseconds for server connections, defaults to 600\n"); printf(" -m send statistics like carbon-cache.py, e.g. not cumulative\n"); printf(" -c characters to allow next to [A-Za-z0-9], defaults to -_:#\n"); printf(" -d debug mode: currently writes statistics to log, prints hash\n" " ring contents and matching position in test mode (-t)\n"); printf(" -s submission mode: don't add any metrics to the stream like\n" " statistics, report drop counts and queue pressure to log\n"); printf(" -t config test mode: prints rule matches from input on stdin\n"); printf(" -H hostname: override hostname (used in statistics)\n"); exit(exitcode); } int main(int argc, char * const argv[]) { int stream_sock[] = {0, 0, 0}; /* tcp4, tcp6, UNIX */ int stream_socklen = sizeof(stream_sock) / sizeof(stream_sock[0]); int dgram_sock[] = {0, 0}; /* udp4, udp6 */ int dgram_socklen = sizeof(dgram_sock) / sizeof(dgram_sock[0]); char id; unsigned short listenport = 2003; unsigned int listenbacklog = 3; int ch; size_t numaggregators; char *listeninterface = NULL; server **servers; char *allowed_chars = NULL; int i; enum { SUB, CUM } smode = CUM; if (gethostname(relay_hostname, sizeof(relay_hostname)) < 0) snprintf(relay_hostname, sizeof(relay_hostname), "127.0.0.1"); while ((ch = getopt(argc, argv, ":hvdmstf:i:l:p:w:b:q:S:T:c:H:B:")) != -1) { switch (ch) { case 'v': do_version(); break; case 'd': if (mode == TEST) { mode = DEBUGTEST; } else if (mode == SUBMISSION) { mode = DEBUGSUBMISSION; } else { mode = DEBUG; } break; case 'm': smode = SUB; break; case 's': if (mode == DEBUG) { mode = DEBUGSUBMISSION; } else { mode = SUBMISSION; } break; case 't': if (mode == DEBUG) { mode = DEBUGTEST; } else { mode = TEST; } break; case 'f': config = optarg; 
break; case 'i': listeninterface = optarg; break; case 'l': relay_logfile = optarg; break; case 'p': listenport = (unsigned short)atoi(optarg); if (listenport == 0) { fprintf(stderr, "error: port needs to be a number >0\n"); do_usage(1); } break; case 'w': workercnt = (char)atoi(optarg); if (workercnt <= 0) { fprintf(stderr, "error: workers needs to be a number >0\n"); do_usage(1); } break; case 'b': batchsize = atoi(optarg); if (batchsize <= 0) { fprintf(stderr, "error: batch size needs to be a number >0\n"); do_usage(1); } break; case 'q': queuesize = atoi(optarg); if (queuesize <= 0) { fprintf(stderr, "error: queue size needs to be a number >0\n"); do_usage(1); } break; case 'S': collector_interval = atoi(optarg); if (collector_interval <= 0) { fprintf(stderr, "error: sending interval needs to be " "a number >0\n"); do_usage(1); } break; case 'T': { int val = atoi(optarg); if (val <= 0) { fprintf(stderr, "error: server IO timeout needs to be a number >0\n"); do_usage(1); } else if (val >= 60000) { fprintf(stderr, "error: server IO timeout needs to be less than one minute\n"); do_usage(1); } iotimeout = (unsigned short)val; } break; case 'c': allowed_chars = optarg; break; case 'H': snprintf(relay_hostname, sizeof(relay_hostname), "%s", optarg); break; case 'B': { int val = atoi(optarg); if (val <= 0) { fprintf(stderr, "error: backlog needs to be a number >0\n"); do_usage(1); } listenbacklog = (unsigned int)val; } break; case '?': case ':': do_usage(1); break; case 'h': default: do_usage(0); break; } } if (optind == 1 || config == NULL) do_usage(1); /* seed randomiser for dispatcher and aggregator "splay" */ srand(time(NULL)); if (workercnt == 0) workercnt = (mode == SUBMISSION || mode == DEBUGSUBMISSION) ? 
2 : get_cores(); /* any_of failover maths need batchsize to be smaller than queuesize */ if (batchsize > queuesize) { fprintf(stderr, "error: batchsize must be smaller than queuesize\n"); exit(-1); } if (relay_logfile != NULL && mode != TEST && mode != DEBUGTEST) { FILE *f = fopen(relay_logfile, "a"); if (f == NULL) { fprintf(stderr, "error: failed to open logfile '%s': %s\n", relay_logfile, strerror(errno)); exit(-1); } relay_stdout = f; relay_stderr = f; } else { relay_stdout = stdout; relay_stderr = stderr; } relay_can_log = 1; logout("starting carbon-c-relay v%s (%s), pid=%d\n", VERSION, GIT_VERSION, getpid()); fprintf(relay_stdout, "configuration:\n"); fprintf(relay_stdout, " relay hostname = %s\n", relay_hostname); fprintf(relay_stdout, " listen port = %u\n", listenport); if (listeninterface != NULL) fprintf(relay_stdout, " listen interface = %s\n", listeninterface); fprintf(relay_stdout, " workers = %d\n", workercnt); fprintf(relay_stdout, " send batch size = %d\n", batchsize); fprintf(relay_stdout, " server queue size = %d\n", queuesize); fprintf(relay_stdout, " statistics submission interval = %ds\n", collector_interval); fprintf(relay_stdout, " listen backlog = %u\n", listenbacklog); fprintf(relay_stdout, " server connection IO timeout = %dms\n", iotimeout); if (allowed_chars != NULL) fprintf(relay_stdout, " extra allowed characters = %s\n", allowed_chars); if (mode == DEBUG || mode == DEBUGTEST || mode == DEBUGSUBMISSION) fprintf(relay_stdout, " debug = true\n"); else if (mode == SUBMISSION || mode == DEBUGSUBMISSION) fprintf(relay_stdout, " submission = true\n"); fprintf(relay_stdout, " routes configuration = %s\n", config); fprintf(relay_stdout, "\n"); if (router_readconfig(&clusters, &routes, &aggrs, config, queuesize, batchsize, iotimeout) == 0) { logerr("failed to read configuration '%s'\n", config); return 1; } router_optimise(&routes); numaggregators = aggregator_numaggregators(aggrs); #define dbg (mode == DEBUG || mode == DEBUGTEST ? 
2 : 0) if (numaggregators > 10 && !dbg) { fprintf(relay_stdout, "parsed configuration follows:\n" "(%zu aggregations with %zu computations omitted " "for brevity)\n", numaggregators, aggregator_numcomputes(aggrs)); router_printconfig(relay_stdout, 0, clusters, routes); } else { fprintf(relay_stdout, "parsed configuration follows:\n"); router_printconfig(relay_stdout, 1 + dbg, clusters, routes); } fprintf(relay_stdout, "\n"); /* shortcut for rule testing mode */ if (mode == TEST || mode == DEBUGTEST) { char metricbuf[METRIC_BUFSIZ]; char *p; fflush(relay_stdout); while (fgets(metricbuf, sizeof(metricbuf), stdin) != NULL) { if ((p = strchr(metricbuf, '\n')) != NULL) *p = '\0'; router_test(metricbuf, routes); } exit(0); } if (signal(SIGINT, exit_handler) == SIG_ERR) { logerr("failed to create SIGINT handler: %s\n", strerror(errno)); return 1; } if (signal(SIGTERM, exit_handler) == SIG_ERR) { logerr("failed to create SIGTERM handler: %s\n", strerror(errno)); return 1; } if (signal(SIGQUIT, exit_handler) == SIG_ERR) { logerr("failed to create SIGQUIT handler: %s\n", strerror(errno)); return 1; } if (signal(SIGHUP, hup_handler) == SIG_ERR) { logerr("failed to create SIGHUP handler: %s\n", strerror(errno)); return 1; } if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) { logerr("failed to ignore SIGPIPE: %s\n", strerror(errno)); return 1; } workers = malloc(sizeof(dispatcher *) * (1 + workercnt + 1)); if (workers == NULL) { logerr("failed to allocate memory for workers\n"); return 1; } if (bindlisten(stream_sock, &stream_socklen, dgram_sock, &dgram_socklen, listeninterface, listenport, listenbacklog) < 0) { logerr("failed to bind on port %s:%d: %s\n", listeninterface == NULL ? 
"" : listeninterface, listenport, strerror(errno)); return -1; } for (ch = 0; ch < stream_socklen; ch++) { if (dispatch_addlistener(stream_sock[ch]) != 0) { logerr("failed to add listener\n"); return -1; } } for (ch = 0; ch < dgram_socklen; ch++) { if (dispatch_addlistener_udp(dgram_sock[ch]) != 0) { logerr("failed to listen to datagram socket\n"); return -1; } } if ((workers[0] = dispatch_new_listener()) == NULL) logerr("failed to add listener\n"); if (allowed_chars == NULL) allowed_chars = "-_:#"; logout("starting %d workers\n", workercnt); for (id = 1; id < 1 + workercnt; id++) { workers[id + 0] = dispatch_new_connection(routes, allowed_chars); if (workers[id + 0] == NULL) { logerr("failed to add worker %d\n", id); break; } } workers[id + 0] = NULL; if (id < 1 + workercnt) { logerr("shutting down due to errors\n"); keep_running = 0; } /* server used for delivering metrics produced inside the relay, * that is, the collector (statistics) */ if ((internal_submission = server_new("internal", listenport, CON_PIPE, NULL, 3000, batchsize, iotimeout)) == NULL) { logerr("failed to create internal submission queue, shutting down\n"); keep_running = 0; } if (numaggregators > 0) { logout("starting aggregator\n"); if (!aggregator_start(aggrs)) { logerr("shutting down due to failure to start aggregator\n"); keep_running = 0; } } logout("starting statistics collector\n"); collector_start(&workers[1], clusters, aggrs, internal_submission, smode == CUM); logout("startup sequence complete\n"); /* workers do the work, just wait */ while (keep_running) sleep(1); logout("shutting down...\n"); /* make sure we don't accept anything new anymore */ for (ch = 0; ch < stream_socklen; ch++) dispatch_removelistener(stream_sock[ch]); destroy_usock(listenport); logout("closed listeners for port %u\n", listenport); /* since workers will be freed, stop querying the structures */ collector_stop(); server_shutdown(internal_submission); free(internal_submission); logout("stopped collector\n"); if 
(numaggregators > 0) { aggregator_stop(); logout("stopped aggregator\n"); } /* give a little time for whatever the collector/aggregator wrote, * to be delivered by the dispatchers */ usleep(500 * 1000); /* 500ms */ /* make sure we don't write to our servers any more */ logout("stopped worker"); for (id = 0; id < 1 + workercnt; id++) dispatch_stop(workers[id + 0]); for (id = 0; id < 1 + workercnt; id++) { dispatch_shutdown(workers[id + 0]); fprintf(relay_stdout, " %d", id + 1); fflush(relay_stdout); } fprintf(relay_stdout, "\n"); fflush(relay_stdout); free(workers); router_shutdown(); servers = router_getservers(clusters); for (i = 0; servers[i] != NULL; i++) server_stop(servers[i]); free(servers); router_free(clusters, routes); logout("stopped servers\n"); logout("stopped carbon-c-relay v%s (%s)\n", VERSION, GIT_VERSION); return 0; } carbon-c-relay-1.7/relay.h000066400000000000000000000021011265266732300154520ustar00rootroot00000000000000/* * Copyright 2013-2016 Fabian Groffen * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef HAVE_RELAY_H #define HAVE_RELAY_H 1 #define VERSION "1.7" #define METRIC_BUFSIZ 8192 enum rmode { NORMAL, DEBUG, SUBMISSION, DEBUGSUBMISSION, TEST, DEBUGTEST }; typedef enum { CON_TCP, CON_UDP, CON_PIPE, CON_FILE } serv_ctype; extern char relay_hostname[]; extern enum rmode mode; enum logdst { LOGOUT, LOGERR }; int relaylog(enum logdst dest, const char *fmt, ...); #define logout(args...) relaylog(LOGOUT, args) #define logerr(args...) 
relaylog(LOGERR, args) #endif carbon-c-relay-1.7/router.c000066400000000000000000002060741265266732300156700ustar00rootroot00000000000000/* * Copyright 2013-2016 Fabian Groffen * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include #include #include #include #include #include #include #include #include #include #include "fnv1a.h" #include "consistent-hash.h" #include "server.h" #include "queue.h" #include "aggregator.h" #include "relay.h" #include "router.h" enum clusttype { BLACKHOLE, /* /dev/null-like destination */ GROUP, /* pseudo type to create a matching tree */ AGGRSTUB, /* pseudo type to have stub matches for aggregation returns */ FORWARD, FILELOG, /* like forward, write metric to file */ FILELOGIP, /* like forward, write ip metric to file */ CARBON_CH, /* original carbon-relay.py consistent-hash */ FNV1A_CH, /* FNV1a-based consistent-hash */ JUMP_CH, /* jump consistent hash with fnv1a input */ ANYOF, /* FNV1a-based hash, but with backup by others */ FAILOVER, /* ordered attempt delivery list */ AGGREGATION, REWRITE }; typedef struct _servers { server *server; struct _servers *next; } servers; typedef struct { unsigned char repl_factor; ch_ring *ring; servers *servers; } chashring; typedef struct { unsigned short count; server **servers; servers *list; } serverlist; struct _cluster { char *name; enum clusttype type; union { chashring *ch; servers *forward; serverlist *anyof; aggregator *aggregation; struct _route *routes; char *replacement; } members; struct 
_cluster *next; }; typedef struct _destinations { cluster *cl; struct _destinations *next; } destinations; struct _route { char *pattern; /* original regex input, used for printing only */ regex_t rule; /* regex on metric, only if type == REGEX */ size_t nmatch; /* number of match groups */ char *strmatch; /* string to search for if type not REGEX or MATCHALL */ destinations *dests; /* where matches should go */ char stop:1; /* whether to continue matching rules after this one */ enum { MATCHALL, /* the '*', don't do anything, just match everything */ REGEX, /* a regex match */ CONTAINS, /* find string occurrence */ STARTS_WITH, /* metric must start with string */ ENDS_WITH, /* metric must end with string */ MATCHES /* metric matches string exactly */ } matchtype; /* how to interpret the pattern */ struct _route *next; }; static char keep_running = 1; /* custom constant, meant to force regex mode matching */ #define REG_FORCE 01000000 /** * Examines pattern and sets matchtype and rule or strmatch in route. 
*/ static int determine_if_regex(route *r, char *pat, int flags) { /* try and see if we can avoid using a regex match, for * it is simply very slow/expensive to do so: most of * the time, people don't need fancy matching rules */ char patbuf[8192]; char *e = pat; char *pb = patbuf; char escape = 0; r->matchtype = CONTAINS; r->nmatch = 0; if (flags & REG_FORCE) { flags &= ~REG_NOSUB; r->matchtype = REGEX; } if (*e == '^' && r->matchtype == CONTAINS) { e++; r->matchtype = STARTS_WITH; } for (; *e != '\0'; e++) { switch (*e) { case '\\': if (escape) *pb++ = *e; escape = !escape; break; case '.': case '^': case '*': case '+': if (!escape) r->matchtype = REGEX; *pb++ = *e; escape = 0; break; case '$': if (!escape && e[1] == '\0') { if (r->matchtype == STARTS_WITH) { r->matchtype = MATCHES; } else { r->matchtype = ENDS_WITH; } } else { r->matchtype = REGEX; } escape = 0; break; default: if ( !escape && ( (*e == '_') || (*e == '-') || (*e >= '0' && *e <= '9') || (*e >= 'a' && *e <= 'z') || (*e >= 'A' && *e <= 'Z') ) ) { *pb++ = *e; } else { r->matchtype = REGEX; } escape = 0; break; } if (pb - patbuf == sizeof(patbuf)) r->matchtype = REGEX; if (r->matchtype == REGEX) break; } if (r->matchtype != REGEX) { *pb = '\0'; r->strmatch = strdup(patbuf); r->pattern = strdup(pat); } else { int ret = regcomp(&r->rule, pat, flags & ~REG_FORCE); if (ret != 0) return ret; /* allow use of regerror */ r->strmatch = NULL; r->pattern = strdup(pat); if (((flags & REG_NOSUB) == 0 && r->rule.re_nsub > 0) || flags & REG_FORCE) { /* we need +1 because position 0 contains the entire * expression */ r->nmatch = r->rule.re_nsub + 1; if (r->nmatch > RE_MAX_MATCHES) { logerr("determine_if_regex: too many match groups, " "please increase RE_MAX_MATCHES in router.h\n"); free(r->pattern); return REG_ESPACE; /* lie closest to the truth */ } } } return 0; } /** * Populates the routing tables by reading the config file. 
 *
 * Config file supports the following:
 *
 * cluster (name)
 *     (forward | any_of [useall] | failover |
 *      (carbon|fnv1a|jump_fnv1a)_ch [replication (count)])
 *         (ip:port[=instance] [proto (tcp | udp)] ...)
 *     ;
 * cluster (name)
 *     file [ip]
 *         (/path/to/file ...)
 *     ;
 * match
 *         (* | regex[ regex ...])
 *     send to (cluster ... | blackhole)
 *     [stop]
 *     ;
 * rewrite (regex)
 *     into (replacement)
 *     ;
 * aggregate
 *         (regex[ regex ...])
 *     every (interval) seconds
 *     expire after (expiration) seconds
 *     [timestamp at (start | middle | end) of bucket]
 *     compute (sum | count | max | min | average |
 *              median | percentile<%> | variance | stddev) write to
 *         (metric)
 *     [compute ... write to ...]
 *     [send to (cluster ...)]
 *     [stop]
 *     ;
 *
 * Comments start with a #-char.
 *
 * Example:
 *
 * cluster ams4
 *    carbon_ch replication 2
 *       10.0.0.1:2003
 *       10.0.0.2:2003
 *       10.0.0.3:2003
 *    ;
 * match *
 *    send to ams4
 *    stop;
 *
 * On success returns 1 and stores the cluster, route and aggregator
 * lists in clret/rret/aret; on any parse or allocation failure logs an
 * error and returns 0.  Parsing is destructive: tokens are terminated
 * in-place inside the file buffer.
 */
int
router_readconfig(cluster **clret, route **rret, aggregator **aret,
		const char *path, size_t queuesize, size_t batchsize,
		unsigned short iotimeout)
{
	FILE *cnf;
	char *buf;
	size_t len = 0;
	char *p;
	cluster *cl;            /* tail of the cluster list being built */
	cluster *topcl;         /* head of the cluster list */
	struct stat st;
	route *r = NULL;        /* tail of the route list being built */
	route *topr = NULL;     /* head of the route list */
	aggregator *a = NULL;
	aggregator *topa = NULL;
	struct addrinfo *saddrs;
	char matchcatchallfound = 0;  /* saw 'match * ... stop' already */

	/* slurp the whole config into one NUL-terminated buffer */
	if (stat(path, &st) == -1)
		return 0;
	if ((cnf = fopen(path, "r")) == NULL)
		return 0;
	buf = malloc(st.st_size + 1);
	/* NOTE(review): len is overwritten (not accumulated) on each
	 * iteration, so a short first read would misplace subsequent
	 * reads; in practice a regular file is read in one go -- verify */
	while ((len = fread(buf + len, 1, st.st_size - len, cnf)) != 0)
		;
	buf[st.st_size] = '\0';
	fclose(cnf);

	/* create virtual blackhole cluster */
	cl = malloc(sizeof(cluster));
	cl->name = strdup("blackhole");
	cl->type = BLACKHOLE;
	cl->members.forward = NULL;
	cl->next = NULL;
	topcl = cl;

	/* remove all comments to ease parsing below */
	p = buf;
	for (; *p != '\0'; p++)
		if (*p == '#')
			for (; *p != '\0' && *p != '\n'; p++)
				*p = ' ';

	p = buf;
	do {
		for (; *p != '\0' && isspace(*p); p++)
			;
		if (*p == '\0')
			break;

		if (strncmp(p, "cluster", 7) == 0 && isspace(*(p + 7))) {
			/* group config */
			servers *w;
			char *name;
			char useall = 0;

			p += 8;
			for (; *p != '\0' && isspace(*p); p++)
				;
			name = p;
			for (; *p != '\0' && !isspace(*p); p++)
				;
			if (*p == '\0') {
				logerr("unexpected end of file after 'cluster'\n");
				free(buf);
				return 0;
			}
			*p++ = '\0';
			for (; *p != '\0' && isspace(*p); p++)
				;
			/* cluster type keyword */
			if (
					(strncmp(p, "carbon_ch", 9) == 0 &&
					 isspace(*(p + 9))) ||
					(strncmp(p, "fnv1a_ch", 8) == 0 &&
					 isspace(*(p + 8))) ||
					(strncmp(p, "jump_fnv1a_ch", 13) == 0 &&
					 isspace(*(p + 13)))
			   )
			{
				int replcnt = 1;
				enum clusttype chtype =
					*p == 'c' ? CARBON_CH :
					*p == 'f' ? FNV1A_CH : JUMP_CH;

				p += chtype == CARBON_CH ? 10 :
					chtype == FNV1A_CH ? 9 : 14;
				for (; *p != '\0' && isspace(*p); p++)
					;
				if (strncmp(p, "replication", 11) == 0 &&
						isspace(*(p + 11)))
				{
					char *repl;
					p += 12;
					for (; *p != '\0' && isspace(*p); p++)
						;
					repl = p;
					/* parse int */
					for (; *p != '\0' && !isspace(*p); p++)
						;
					if (*p == '\0') {
						logerr("unexpected end of file after "
								"'replication %s' for cluster %s\n",
								repl, name);
						free(buf);
						return 0;
					}
					*p++ = '\0';
					if ((replcnt = atoi(repl)) == 0)
						replcnt = 1;
				}

				if ((cl = cl->next = malloc(sizeof(cluster))) == NULL) {
					logerr("malloc failed in cluster %s\n", name);
					free(buf);
					return 0;
				}
				cl->type = chtype;
				cl->members.ch = malloc(sizeof(chashring));
				cl->members.ch->repl_factor = (unsigned char)replcnt;
				cl->members.ch->ring = ch_new(
						chtype == CARBON_CH ? CARBON :
						chtype == FNV1A_CH ? FNV1a : JUMP_FNV1a);
			} else if (strncmp(p, "forward", 7) == 0 &&
					isspace(*(p + 7)))
			{
				p += 8;
				if ((cl = cl->next = malloc(sizeof(cluster))) == NULL) {
					logerr("malloc failed in cluster forward\n");
					free(buf);
					return 0;
				}
				cl->type = FORWARD;
				cl->members.forward = NULL;
			} else if (strncmp(p, "any_of", 6) == 0 &&
					isspace(*(p + 6)))
			{
				p += 7;
				for (; *p != '\0' && isspace(*p); p++)
					;
				/* 'useall' keeps every address a hostname resolves to */
				if (strncmp(p, "useall", 6) == 0 && isspace(*(p + 6))) {
					p += 7;
					useall = 1;
				}
				if ((cl = cl->next = malloc(sizeof(cluster))) == NULL) {
					logerr("malloc failed in cluster any_of\n");
					free(buf);
					return 0;
				}
				cl->type = ANYOF;
				cl->members.anyof = NULL;
			} else if (strncmp(p, "failover", 8) == 0 &&
					isspace(*(p + 8)))
			{
				p += 9;
				for (; *p != '\0' && isspace(*p); p++)
					;
				if ((cl = cl->next = malloc(sizeof(cluster))) == NULL) {
					logerr("malloc failed in cluster failover\n");
					free(buf);
					return 0;
				}
				cl->type = FAILOVER;
				cl->members.anyof = NULL;
			} else if (strncmp(p, "file", 4) == 0 && isspace(*(p + 4))) {
				p += 5;
				for (; *p != '\0' && isspace(*p); p++)
					;
				if ((cl = cl->next = malloc(sizeof(cluster))) == NULL) {
					logerr("malloc failed in cluster file\n");
					free(buf);
					return 0;
				}
				if (strncmp(p, "ip", 2) == 0 && isspace(*(p + 2))) {
					p += 3;
					for (; *p != '\0' && isspace(*p); p++)
						;
					cl->type = FILELOGIP;
				} else {
					cl->type = FILELOG;
				}
				cl->members.forward = NULL;
			} else {
				char *type = p;
				for (; *p != '\0' && !isspace(*p); p++)
					;
				*p = 0;
				logerr("unknown cluster type '%s' for cluster %s\n",
						type, name);
				free(buf);
				return 0;
			}

			/* parse ips */
			for (; *p != '\0' && isspace(*p); p++)
				;
			w = NULL;
			do {
				char termchr;
				char *lastcolon = NULL;   /* candidate port separator */
				char *ip = p;
				char *inst = NULL;        /* =instance override (ch only) */
				char *proto = "tcp";
				int port = 2003;
				server *newserver = NULL;
				struct addrinfo hint;
				char sport[8];
				int err;
				struct addrinfo *walk = NULL;
				struct addrinfo *next = NULL;
				char hnbuf[256];

				for (; *p != '\0' && !isspace(*p) && *p != ';'; p++) {
					if (*p == ':' && inst == NULL)
						lastcolon = p;
					if (*p == '=' && inst == NULL)
						inst = p;
				}
				if (*p == '\0') {
					logerr("unexpected end of file at '%s' "
							"for cluster %s\n", ip, name);
					free(cl);
					free(buf);
					return 0;
				}
				termchr = *p;
				*p = '\0';

				if (cl->type == CARBON_CH ||
						cl->type == FNV1A_CH ||
						cl->type == JUMP_CH)
				{
					if (inst != NULL) {
						*inst = '\0';
						p = inst++;
					}
					if (inst == ip)
						inst = NULL;
				}

				/* a ']' right before the separator means the colon was
				 * part of a bracketed IPv6 literal, not a port */
				if (*(p - 1) == ']')
					lastcolon = NULL;
				if (lastcolon != NULL) {
					char *endp = NULL;
					*lastcolon = '\0';
					port = (int)strtol(lastcolon + 1, &endp, 10);
					if (port == 0 || endp != p) {
						logerr("expected port, or unexpected data at "
								"'%s' for cluster %s\n",
								lastcolon + 1, name);
						free(cl);
						free(buf);
						return 0;
					}
				}
				if (*ip == '[') {
					/* strip the brackets of an IPv6 literal */
					ip++;
					if (lastcolon != NULL && *(lastcolon - 1) == ']') {
						*(lastcolon - 1) = '\0';
					} else if (lastcolon == NULL && *(p - 1) == ']') {
						*(p - 1) = '\0';
					} else {
						logerr("expected ']' at '%s' "
								"for cluster %s\n", ip, name);
						free(cl);
						free(buf);
						return 0;
					}
				}
				if (inst != NULL)
					p = inst + strlen(inst);

				if (isspace(termchr)) {
					p++;
					for (; *p != '\0' && isspace(*p); p++)
						;
					if (strncmp(p, "proto", 5) == 0 &&
							isspace(*(p + 5)))
					{
						p += 6;
						for (; *p != '\0' && isspace(*p); p++)
							;
						proto = p;
						for (; *p != '\0' && !isspace(*p) && *p != ';';
								p++)
							;
						termchr = *p;
						*p = '\0';
						if (strcmp(proto, "tcp") != 0 &&
								strcmp(proto, "udp") != 0)
						{
							logerr("expected 'udp' or 'tcp' after "
									"'proto' at '%s' for cluster %s\n",
									proto, name);
							free(cl);
							free(buf);
							return 0;
						}
					} else {
						termchr = *p;
					}
				}

				if (cl->type != FILELOG && cl->type != FILELOGIP) {
					/* resolve host/IP */
					memset(&hint, 0, sizeof(hint));
					hint.ai_family = PF_UNSPEC;
					hint.ai_socktype = *proto == 'u' ?
						SOCK_DGRAM : SOCK_STREAM;
					hint.ai_protocol = *proto == 'u' ?
						IPPROTO_UDP : IPPROTO_TCP;
					hint.ai_flags = AI_NUMERICSERV;
					snprintf(sport, sizeof(sport), "%u", port);  /* for default */

					if ((err = getaddrinfo(ip, sport, &hint,
									&saddrs)) != 0)
					{
						logerr("failed to resolve server %s:%s (%s) "
								"for cluster %s: %s\n",
								ip, sport, proto, name,
								gai_strerror(err));
						free(cl);
						free(buf);
						return 0;
					}
					if (!useall && saddrs->ai_next != NULL) {
						/* take first result only */
						freeaddrinfo(saddrs->ai_next);
						saddrs->ai_next = NULL;
					}
				} else {
					/* TODO: try to create/append to file */
					proto = "file";
					/* sentinel: file servers have no addrinfo */
					saddrs = (void *)1;
				}

				walk = saddrs;
				while (walk != NULL) {
					/* disconnect from the rest to avoid double
					 * frees by freeaddrinfo() in server_destroy() */
					if (walk != (void *)1) {
						next = walk->ai_next;
						walk->ai_next = NULL;
					}
					if (useall) {
						/* unfold whatever we resolved, for human
						 * readability issues */
						if (walk->ai_family == AF_INET) {
							if (inet_ntop(walk->ai_family,
									&((struct sockaddr_in *)
										walk->ai_addr)->sin_addr,
									hnbuf, sizeof(hnbuf)) != NULL)
								ip = hnbuf;
						} else if (walk->ai_family == AF_INET6) {
							if (inet_ntop(walk->ai_family,
									&((struct sockaddr_in6 *)
										walk->ai_addr)->sin6_addr,
									hnbuf + 1, sizeof(hnbuf) - 2)
									!= NULL)
							{
								hnbuf[0] = '[';
								/* space is reserved above */
								strcat(hnbuf, "]");
								ip = hnbuf;
							}
						}
					}
					newserver = server_new(ip, (unsigned short)port,
							*proto == 'f' ? CON_FILE :
							*proto == 'u' ? CON_UDP : CON_TCP,
							walk == (void *)1 ? NULL : walk,
							queuesize, batchsize, iotimeout);
					if (newserver == NULL) {
						logerr("failed to add server %s:%d (%s) "
								"to cluster %s: %s\n", ip, port,
								proto, name, strerror(errno));
						free(cl);
						free(buf);
						return 0;
					}

					if (cl->type == CARBON_CH ||
							cl->type == FNV1A_CH ||
							cl->type == JUMP_CH)
					{
						if (w == NULL) {
							cl->members.ch->servers = w =
								malloc(sizeof(servers));
						} else {
							w = w->next = malloc(sizeof(servers));
						}
						if (w == NULL) {
							logerr("malloc failed for %s_ch %s\n",
									cl->type == CARBON_CH ? "carbon" :
									cl->type == FNV1A_CH ? "fnv1a" :
									"jump_fnv1a", ip);
							free(cl);
							free(buf);
							return 0;
						}
						w->next = NULL;
						if (inst != NULL)
							server_set_instance(newserver, inst);
						w->server = newserver;
						cl->members.ch->ring = ch_addnode(
								cl->members.ch->ring, w->server);
						if (cl->members.ch->ring == NULL) {
							logerr("failed to add server %s:%d "
									"to ring for cluster %s: "
									"out of memory\n", ip, port, name);
							free(cl);
							free(buf);
							return 0;
						}
					} else if (cl->type == FORWARD ||
							cl->type == ANYOF ||
							cl->type == FAILOVER ||
							cl->type == FILELOG ||
							cl->type == FILELOGIP)
					{
						if (w == NULL) {
							w = malloc(sizeof(servers));
						} else {
							w = w->next = malloc(sizeof(servers));
						}
						if (w == NULL) {
							logerr("malloc failed for %s %s\n",
									cl->type == FORWARD ? "forward" :
									cl->type == ANYOF ? "any_of" :
									cl->type == FAILOVER ? "failover" :
									"file", ip);
							free(cl);
							free(buf);
							return 0;
						}
						w->next = NULL;
						w->server = newserver;
						if ((cl->type == FORWARD ||
								cl->type == FILELOG ||
								cl->type == FILELOGIP) &&
								cl->members.forward == NULL)
							cl->members.forward = w;
						if (cl->type == ANYOF ||
								cl->type == FAILOVER)
						{
							if (cl->members.anyof == NULL) {
								cl->members.anyof =
									malloc(sizeof(serverlist));
								cl->members.anyof->count = 1;
								cl->members.anyof->servers = NULL;
								cl->members.anyof->list = w;
							} else {
								cl->members.anyof->count++;
							}
						}
					}
					walk = next;
				}

				*p = termchr;
				for (; *p != '\0' && isspace(*p); p++)
					;
			} while (*p != ';');
			p++;  /* skip over ';' */

			if (cl->type == ANYOF || cl->type == FAILOVER) {
				/* flatten the member list into an array so each server
				 * knows its secondaries */
				size_t i = 0;
				cl->members.anyof->servers =
					malloc(sizeof(server *) * cl->members.anyof->count);
				for (w = cl->members.anyof->list; w != NULL; w = w->next)
					cl->members.anyof->servers[i++] = w->server;
				for (w = cl->members.anyof->list; w != NULL;
						w = w->next)
				{
					server_add_secondaries(w->server,
							cl->members.anyof->servers,
							cl->members.anyof->count);
					if (cl->type == FAILOVER)
						server_set_failover(w->server);
				}
			} else if (cl->type == CARBON_CH ||
					cl->type == FNV1A_CH ||
					cl->type == JUMP_CH)
			{
				/* check that replication count is actually <= the
				 * number of servers */
				size_t i = 0;
				for (w = cl->members.ch->servers; w != NULL;
						w = w->next)
					i++;
				if (i < cl->members.ch->repl_factor) {
					logerr("invalid cluster '%s': replication count "
							"(%zu) is "
							"larger than the number of servers (%zu)\n",
							name, cl->members.ch->repl_factor, i);
					free(cl);
					free(buf);
					return 0;
				}
			}

			cl->name = strdup(name);
			cl->next = NULL;
		} else if (strncmp(p, "match", 5) == 0 && isspace(*(p + 5))) {
			/* match rule */
			char *pat;
			char *dest;
			char stop = -1;
			cluster *w;
			route *m = NULL;          /* first route of this rule */
			destinations *d = NULL;
			destinations *dw = NULL;  /* head of destination list */
/* free all routes allocated for the current rule (m .. r) */
#define FREE_R \
	{ \
		route *lm; \
		do { \
			lm = m->next; \
			free(m); \
		} while (m != r && (m = lm) != NULL); \
	}
/* free the destination list built for the current rule */
#define FREE_D \
	{ \
		destinations *ld; \
		while (dw != NULL) { \
			ld = dw->next; \
			free(dw); \
			dw = ld; \
		} \
	}

			p += 6;
			for (; *p != '\0' && isspace(*p); p++)
				;
			/* one route per expression, until 'send' is seen */
			do {
				pat = p;
				for (; *p != '\0' && !isspace(*p); p++)
					;
				if (*p == '\0') {
					logerr("unexpected end of file after 'match'\n");
					FREE_R;
					free(buf);
					return 0;
				}
				*p++ = '\0';
				for (; *p != '\0' && isspace(*p); p++)
					;

				if (r == NULL) {
					topr = r = malloc(sizeof(route));
				} else {
					r = r->next = malloc(sizeof(route));
				}
				if (m == NULL)
					m = r;
				if (strcmp(pat, "*") == 0) {
					r->pattern = NULL;
					r->strmatch = NULL;
					r->matchtype = MATCHALL;
				} else {
					int err = determine_if_regex(r, pat,
							REG_EXTENDED | REG_NOSUB);
					if (err != 0) {
						char ebuf[512];
						regerror(err, &r->rule, ebuf, sizeof(ebuf));
						logerr("invalid expression '%s' "
								"for match: %s\n", pat, ebuf);
						FREE_R;
						free(buf);
						return 0;
					}
				}
				r->next = NULL;
			} while (strncmp(p, "send", 4) != 0 || !isspace(*(p + 4)));
			p += 5;
			for (; *p != '\0' && isspace(*p); p++)
				;
			if (strncmp(p, "to", 2) != 0 || !isspace(*(p + 2))) {
				logerr("expected 'send to' after match %s\n", pat);
				FREE_R;
				free(buf);
				return 0;
			}
			p += 3;
			for (; *p != '\0' && isspace(*p); p++)
				;
			/* destination clusters, until ';' or 'stop' */
			do {
				char save;
				dest = p;
				for (; *p != '\0' && !isspace(*p) && *p != ';'; p++)
					;
				if (*p == '\0')
					break;
				save = *p;
				*p = '\0';

				/* lookup dest */
				for (w = topcl; w != NULL; w = w->next) {
					if (w->type != GROUP &&
							w->type != AGGRSTUB &&
							w->type != AGGREGATION &&
							w->type != REWRITE &&
							strcmp(w->name, dest) == 0)
						break;
				}
				if (w == NULL) {
					logerr("no such cluster '%s' for 'match %s'\n",
							dest, pat);
					FREE_R;
					FREE_D;
					free(buf);
					return 0;
				}

				if (dw == NULL) {
					dw = d = malloc(sizeof(destinations));
				} else {
					d = d->next = malloc(sizeof(destinations));
				}
				if (d == NULL) {
					logerr("out of memory allocating new destination "
							"'%s' "
							"for 'match %s'\n", dest, pat);
					FREE_R;
					FREE_D;
					free(buf);
					return 0;
				}
				d->cl = w;
				d->next = NULL;

				*p = save;
				for (; *p != '\0' && isspace(*p); p++)
					;
			} while (*p != ';' &&
					!(strncmp(p, "stop", 4) == 0 &&
						(isspace(*(p + 4)) || *(p + 4) == ';')));
			if (*p == '\0') {
				logerr("unexpected end of file after 'send to %s'\n",
						dest);
				FREE_R;
				FREE_D;
				free(buf);
				return 0;
			} else if (*p == ';') {
				stop = 0;
			} else {
				/* due to strncmp above, this must be stop */
				p += 4;
				stop = 1;
			}
			/* find the closing ';' */
			for (; *p != '\0' && isspace(*p); p++)
				;
			if (*p != ';') {
				logerr("expected ';' after match %s\n", pat);
				FREE_R;
				FREE_D;
				free(buf);
				return 0;
			}
			p++;

			/* fill in the destinations for all the routes */
			do {
				m->dests = dw;
				/* blackhole implies stop */
				m->stop = w->type == BLACKHOLE ? 1 : stop;
			} while (m != r && (m = m->next) != NULL);

			if (matchcatchallfound) {
				logerr("warning: match %s will never be matched "
						"due to preceding match * ... stop\n",
						r->pattern == NULL ? "*" : r->pattern);
			}
			if (r->matchtype == MATCHALL && r->stop)
				matchcatchallfound = 1;
		} else if (strncmp(p, "aggregate", 9) == 0 &&
				isspace(*(p + 9)))
		{
			/* aggregation rule */
			char *type;
			char *pat;
			char *num;
			enum _aggr_timestamp tswhen = TS_END;
			cluster *w;               /* the AGGREGATION cluster */
			int err;
			int intv;                 /* bucket interval (seconds) */
			int exp;                  /* expiry (seconds) */
			char stop = 0;
			route *m = NULL;
			destinations *d = NULL;
			destinations *dw = NULL;  /* optional 'send to' dests */

			p += 10;
			for (; *p != '\0' && isspace(*p); p++)
				;

			w = malloc(sizeof(cluster));
			w->name = NULL;
			w->type = AGGREGATION;
			w->next = NULL;

			/* one route per expression, until 'every' is seen */
			do {
				pat = p;
				for (; *p != '\0' && !isspace(*p); p++)
					;
				if (*p == '\0') {
					/* NOTE(review): w leaks on this early exit */
					logerr("unexpected end of file after "
							"'aggregate'\n");
					free(buf);
					return 0;
				}
				*p++ = '\0';
				for (; *p != '\0' && isspace(*p); p++)
					;

				if (r == NULL) {
					topr = r = malloc(sizeof(route));
				} else {
					r = r->next = malloc(sizeof(route));
				}
				if (m == NULL)
					m = r;

				err = determine_if_regex(r, pat, REG_EXTENDED);
				if (err != 0) {
					char ebuf[512];
					regerror(err, &r->rule, ebuf, sizeof(ebuf));
					logerr("invalid expression '%s' "
							"for aggregation: %s\n", pat, ebuf);
					FREE_R;
					free(w);
					free(buf);
					return 0;
				}
				r->next = NULL;
			} while (strncmp(p, "every", 5) != 0 ||
					!isspace(*(p + 5)));
			p += 6;
			for (; *p != '\0' && isspace(*p); p++)
				;
			num = p;
			for (; *p != '\0' && isdigit(*p); p++)
				;
			if (*p == '\0') {
				logerr("unexpected end of file after 'every %s'\n",
						num);
				FREE_R;
				free(w);
				free(buf);
				return 0;
			}
			if (!isspace(*p)) {
				logerr("unexpected character '%c', "
						"expected number after 'every'\n", *p);
				FREE_R;
				free(w);
				free(buf);
				return 0;
			}
			*p++ = '\0';
			for (; *p != '\0' && isspace(*p); p++)
				;
			if (strncmp(p, "seconds", 7) != 0 || !isspace(*(p + 7))) {
				logerr("expected 'seconds' after 'every %s'\n", num);
				FREE_R;
				free(w);
				free(buf);
				return 0;
			}
			p += 8;
			intv = atoi(num);
			if (intv == 0) {
				logerr("interval must be non-zero\n");
				FREE_R;
				free(w);
				free(buf);
				return 0;
			}
			for (; *p != '\0' && isspace(*p); p++)
				;
			if (strncmp(p, "expire", 6) != 0 || !isspace(*(p + 6))) {
				logerr("expected 'expire after' after "
						"'every %s seconds\n", num);
				FREE_R;
				free(w);
				free(buf);
				return 0;
			}
			p += 7;
			for (; *p != '\0' && isspace(*p); p++)
				;
			if (strncmp(p, "after", 5) != 0 || !isspace(*(p + 5))) {
				logerr("expected 'after' after 'expire'\n");
				FREE_R;
				free(w);
				free(buf);
				return 0;
			}
			p += 6;
			for (; *p != '\0' && isspace(*p); p++)
				;
			num = p;
			for (; *p != '\0' && isdigit(*p); p++)
				;
			if (*p == '\0') {
				logerr("unexpected end of file after "
						"'expire after %s'\n", num);
				FREE_R;
				free(w);
				free(buf);
				return 0;
			}
			if (!isspace(*p)) {
				logerr("unexpected character '%c', "
						"expected number after 'expire after'\n", *p);
				FREE_R;
				free(w);
				free(buf);
				return 0;
			}
			*p++ = '\0';
			for (; *p != '\0' && isspace(*p); p++)
				;
			if (strncmp(p, "seconds", 7) != 0 || !isspace(*(p + 7))) {
				logerr("expected 'seconds' after 'expire after %s'\n",
						num);
				FREE_R;
				free(w);
				free(buf);
				return 0;
			}
			p += 8;
			exp = atoi(num);
			if (exp == 0) {
				logerr("expire must be non-zero\n");
				FREE_R;
				free(w);
				free(buf);
				return 0;
			}
			if (exp < intv) {
				logerr("expire (%d) must be greater than "
						"interval (%d)\n", exp, intv);
				FREE_R;
				free(w);
				free(buf);
				return 0;
			}
			for (; *p != '\0' && isspace(*p); p++)
				;
			/* optional timestamp bit */
			if (strncmp(p, "timestamp", 9) == 0 && isspace(*(p + 9))) {
				p += 10;
				for (; *p != '\0' && isspace(*p); p++)
					;
				if (strncmp(p, "at", 2) != 0 || !isspace(*(p + 2))) {
					logerr("expected 'at' after 'timestamp'\n");
					FREE_R;
					free(w);
					free(buf);
					return 0;
				}
				p += 3;
				for (; *p != '\0' && isspace(*p); p++)
					;
				if (strncmp(p, "start", 5) == 0 && isspace(*(p + 5))) {
					p += 6;
					tswhen = TS_START;
				} else if (strncmp(p, "middle", 6) == 0 &&
						isspace(*(p + 6)))
				{
					p += 7;
					tswhen = TS_MIDDLE;
				} else if (strncmp(p, "end", 3) == 0 &&
						isspace(*(p + 3)))
				{
					p += 4;
					tswhen = TS_END;
				} else {
					logerr("expected 'start', 'middle' or 'end' "
							"after 'timestamp at'\n");
					FREE_R;
					free(w);
					free(buf);
					return 0;
				}
				for (; *p != '\0' && isspace(*p); p++)
					;
				if (strncmp(p, "of", 2) != 0 || !isspace(*(p + 2))) {
					logerr("expected 'of' after 'timestamp at ...'\n");
					FREE_R;
					free(w);
					free(buf);
					return 0;
				}
				p += 3;
				for (; *p != '\0' && isspace(*p); p++)
					;
				if (strncmp(p, "bucket", 6) != 0 ||
						!isspace(*(p + 6)))
				{
					logerr("expected 'bucket' after "
							"'timestamp at ... of'\n");
					FREE_R;
					free(w);
					free(buf);
					return 0;
				}
				p += 7;
				for (; *p != '\0' && isspace(*p); p++)
					;
			}

			w->members.aggregation = aggregator_new(intv, exp, tswhen);
			if (w->members.aggregation == NULL) {
				logerr("out of memory while allocating new "
						"aggregator\n");
				FREE_R;
				free(w);
				free(buf);
				return 0;
			}
			if (a == NULL) {
				topa = a = w->members.aggregation;
			} else {
				a = a->next = w->members.aggregation;
			}

			/* one or more 'compute ... write to ...' clauses */
			do {
				if (strncmp(p, "compute", 7) != 0 ||
						!isspace(*(p + 7)))
				{
					logerr("expected 'compute' at: %.20s\n", p);
					FREE_R;
					free(w);
					free(buf);
					return 0;
				}
				p += 8;
				pat = p;
				for (; *p != '\0' && !isspace(*p); p++)
					;
				if (*p == '\0') {
					logerr("unexpected end of file after 'compute'\n");
					FREE_R;
					free(w);
					free(buf);
					return 0;
				}
				*p++ = '\0';
				type = pat;
				for (; *p != '\0' && isspace(*p); p++)
					;
				if (strncmp(p, "write", 5) != 0 || !isspace(*(p + 5))) {
					logerr("expected 'write to' after 'compute %s'\n",
							pat);
					FREE_R;
					free(w);
					free(buf);
					return 0;
				}
				p += 6;
				for (; *p != '\0' && isspace(*p); p++)
					;
				if (strncmp(p, "to", 2) != 0 || !isspace(*(p + 2))) {
					logerr("expected 'write to' after 'compute %s'\n",
							pat);
					FREE_R;
					free(w);
					free(buf);
					return 0;
				}
				p += 3;
				for (; *p != '\0' && isspace(*p); p++)
					;
				pat = p;
				for (; *p != '\0' && !isspace(*p); p++)
					;
				if (*p == '\0') {
					logerr("unexpected end of file after "
							"'write to'\n");
					FREE_R;
					free(w);
					free(buf);
					return 0;
				}
				*p++ = '\0';

				if (aggregator_add_compute(w->members.aggregation,
							pat, type) != 0)
				{
					/* NOTE(review): bare '%>' in this format string is
					 * an invalid printf conversion -- confirm against
					 * upstream whether '%%' was intended */
					logerr("expected sum, count, max, min, average, "
							"median, percentile<%>, variance or stddev "
							"after 'compute', got '%s'\n", type);
					FREE_R;
					free(w);
					free(buf);
					return 0;
				}
				for (; *p != '\0' && isspace(*p); p++)
					;
			} while (*p != ';' &&
					!(strncmp(p, "send", 4) == 0 &&
						isspace(*(p + 4))) &&
					!(strncmp(p, "stop", 4) == 0 &&
						(isspace(*(p + 4)) || *(p + 4) == ';')));
			/* optional 'send to' for forwarding aggregated output */
			if (strncmp(p, "send", 4) == 0 && isspace(*(p + 4))) {
				cluster *cw;
				p += 5;
				for (; *p != '\0' && isspace(*p); p++)
					;
				if (strncmp(p, "to", 2) != 0 || !isspace(*(p + 2))) {
					logerr("expected 'to' after 'send' for "
							"aggregate\n");
					free(buf);
					return 0;
				}
				p += 3;
				for (; *p != '\0' && isspace(*p); p++)
					;
				do {
					char save;
					char *dest = p;
					for (; *p != '\0' && !isspace(*p) && *p != ';';
							p++)
						;
					if (*p == '\0')
						break;
					save = *p;
					*p = '\0';

					/* lookup dest */
					for (cw = topcl; cw != NULL; cw = cw->next) {
						if (cw->type != GROUP &&
								cw->type != AGGRSTUB &&
								cw->type != AGGREGATION &&
								cw->type != REWRITE &&
								strcmp(cw->name, dest) == 0)
							break;
					}
					if (cw == NULL) {
						logerr("no such cluster '%s' for "
								"aggregate\n", dest);
						FREE_D;
						FREE_R;
						free(w);
						free(buf);
						return 0;
					}

					if (dw == NULL) {
						dw = d = malloc(sizeof(destinations));
					} else {
						d = d->next = malloc(sizeof(destinations));
					}
					if (d == NULL) {
						logerr("out of memory allocating new "
								"destination '%s' "
								"for aggregation\n", dest);
						FREE_D;
						FREE_R;
						free(w);
						free(buf);
						return 0;
					}
					d->cl = cw;
					d->next = NULL;

					*p = save;
					for (; *p != '\0' && isspace(*p); p++)
						;
				} while (*p != ';' &&
						!(strncmp(p, "stop", 4) == 0 &&
							(isspace(*(p + 4)) || *(p + 4) == ';')));
			}
			if (strncmp(p, "stop", 4) == 0 &&
					(isspace(*(p + 4)) || *(p + 4) == ';'))
			{
				p += 4;
				stop = 1;
				for (; *p != '\0' && isspace(*p); p++)
					;
			}
			if (*p == '\0') {
				logerr("unexpected end of file after "
						"'aggregate %s'\n", r->pattern);
				FREE_D;
				FREE_R;
				free(w);
				free(buf);
				return 0;
			}
			if (*p != ';') {
				logerr("expected ';' after aggregate %s\n",
						r->pattern);
				FREE_D;
				FREE_R;
				free(w);
				free(buf);
				return 0;
			}
			p++;

			/* add cluster to the list of clusters */
			cl = cl->next = w;

			/* fill in the destinations for all the routes, this is for
			 * printing and free-ing */
			do {
				m->dests = malloc(sizeof(destinations));
				m->dests->cl = w;
				m->dests->next = dw;
				m->stop = stop;
			} while (m != r && (m = m->next) != NULL);

			if (dw != NULL) {
				/* the aggregator emits metrics with a unique stub
				 * prefix; route those to the requested destinations
				 * via an AGGRSTUB cluster matched first */
				char stubname[48];

				m = malloc(sizeof(route));
				m->pattern = NULL;
				m->strmatch = NULL;
				m->dests = dw;
				m->stop = 1;
				m->matchtype = MATCHALL;
				m->next = NULL;

				/* inject stub route for dests */
				d = malloc(sizeof(destinations));
				cl = cl->next = d->cl = malloc(sizeof(cluster));
				cl->name = NULL;
				cl->type = AGGRSTUB;
				cl->members.routes = m;
				cl->next = NULL;
				d->next = NULL;

				snprintf(stubname, sizeof(stubname),
						"_stub_aggregator_%p__",
						w->members.aggregation);

				m = malloc(sizeof(route));
				m->pattern = strdup(stubname);
				m->strmatch = strdup(stubname);
				m->dests = d;
				m->stop = 1;
				m->matchtype = STARTS_WITH;
				/* enforce first match to avoid interference */
				m->next = topr;
				topr = m;

				aggregator_set_stub(w->members.aggregation, stubname);
			}

			if (matchcatchallfound) {
				logerr("warning: aggregate %s will never be matched "
						"due to preceeding match * ... stop\n",
						r->pattern);
			}
		} else if (strncmp(p, "rewrite", 7) == 0 && isspace(*(p + 7))) {
			/* rewrite rule */
			char *pat;
			char *replacement;
			int err;

			p += 8;
			for (; *p != '\0' && isspace(*p); p++)
				;
			pat = p;
			for (; *p != '\0' && !isspace(*p); p++)
				;
			if (*p == '\0') {
				logerr("unexpected end of file after 'rewrite'\n");
				free(buf);
				return 0;
			}
			*p++ = '\0';
			for (; *p != '\0' && isspace(*p); p++)
				;
			if (strncmp(p, "into", 4) != 0 || !isspace(*(p + 4))) {
				logerr("expected 'into' after rewrite %s\n", pat);
				free(buf);
				return 0;
			}
			p += 5;
			for (; *p != '\0' && isspace(*p); p++)
				;
			replacement = p;
			for (; *p != '\0' && !isspace(*p) && *p != ';'; p++)
				;
			if (*p == '\0') {
				logerr("unexpected end of file after 'into %s'\n",
						replacement);
				free(buf);
				return 0;
			} else if (*p == ';') {
				*p++ = '\0';
			} else {
				*p++ = '\0';
				for (; *p != '\0' && isspace(*p) && *p != ';'; p++)
					;
				if (*p != ';') {
					logerr("expected ';' after %s\n", replacement);
					free(buf);
					return 0;
				}
				p++;
			}

			if (r == NULL) {
				topr = r = malloc(sizeof(route));
			} else {
				r = r->next = malloc(sizeof(route));
			}

			/* REG_FORCE: rewrites need submatch capture */
			err = determine_if_regex(r, pat, REG_EXTENDED | REG_FORCE);
			if (err != 0) {
				char ebuf[512];
				regerror(err, &r->rule, ebuf, sizeof(ebuf));
				logerr("invalid expression '%s' for rewrite: %s\n",
						pat, ebuf);
				free(r);
				free(buf);
				return 0;
			}

			if ((cl = cl->next = malloc(sizeof(cluster))) == NULL) {
				logerr("malloc failed for rewrite destination\n");
				free(r);
				free(buf);
				return 0;
			}
			cl->type = REWRITE;
			cl->name = NULL;
			cl->members.replacement = strdup(replacement);
			cl->next = NULL;
			r->dests = malloc(sizeof(destinations));
			r->dests->cl = cl;
			r->dests->next = NULL;
			r->stop = 0;
			r->next = NULL;

			if (matchcatchallfound) {
				logerr("warning: rewrite %s will never be matched "
						"due to preceeding match * ... stop\n",
						r->pattern == NULL ? "*" : r->pattern);
			}
		} else {
			/* garbage? */
			logerr("garbage in config: %s\n", p);
			free(buf);
			router_free(topcl, topr);
			return 0;
		}
	} while (*p != '\0');

	free(buf);
	*clret = topcl;
	*rret = topr;
	*aret = topa;
	return 1;
}

/* grouping bucket used by router_optimise: a set of routes sharing the
 * same trailing [a-zA-Z_]+ pattern block */
typedef struct _block {
	char *pattern;        /* shared tail (reversed), NULL for singles */
	size_t hash;          /* cheap byte-sum of the tail */
	char prio;            /* 1 = order-pinning (stop/matchall), 2 = free */
	size_t refcnt;        /* number of routes in this block */
	size_t seqnr;         /* ordering epoch (bumped at stop rules) */
	route *firstroute;
	route *lastroute;
	struct _block *prev;
	struct _block *next;
} block;

/**
 * Tries to optimise the match and aggregation rules in such a way that
 * the number of matches for non-matching metrics are reduced.  The
 * problem is that with many metrics flowing in, the time to perform
 * lots of regex matches is high.  This is not too bad if that time
 * spent actually results in a metric being counted (aggregation) or
 * sent further (match), but it is when the metric would be discarded
 * for it did not match anything.
 * Hence, we employ a simple strategy to try and reduce the incoming
 * stream of metrics as soon as possible before performing the more
 * specific and expensive matches to confirm fit.
 */
void
router_optimise(route **routes)
{
	char *p;
	char pblock[64];
	char *b;
	route *rwalk;
	route *rnext;
	block *blocks;
	block *bwalk;
	block *bstart;
	block *blast;
	size_t bsum;
	size_t seq;

	/* avoid optimising anything if it won't pay off */
	seq = 0;
	for (rwalk = *routes; rwalk != NULL && seq < 50; rwalk = rwalk->next)
		seq++;
	if (seq < 50)
		return;

	/* Heuristic: the last part of the matching regex is the most
	 * discriminating part of the metric.  The last part is defined as a
	 * block of characters matching [a-zA-Z_]+ at the end disregarding
	 * any characters not matched by the previous expression.  Then from
	 * these last parts we create groups, that -- if having enough
	 * members -- is used to reduce the amount of comparisons done
	 * before determining that an input metric cannot match any
	 * expression we have defined. */
	seq = 0;
	blast = bstart = blocks = malloc(sizeof(block));
	blocks->refcnt = 0;           /* dummy head node */
	blocks->seqnr = seq++;
	blocks->prev = NULL;
	blocks->next = NULL;
	for (rwalk = *routes; rwalk != NULL; rwalk = rnext) {
		/* matchall rules cannot be in a group */
		if (rwalk->matchtype == MATCHALL) {
			blast->next = malloc(sizeof(block));
			blast->next->prev = blast;
			blast = blast->next;
			blast->pattern = NULL;
			blast->hash = 0;
			blast->prio = 1;
			blast->refcnt = 1;
			blast->seqnr = seq++;
			blast->firstroute = rwalk;
			blast->lastroute = rwalk;
			blast->next = NULL;
			rnext = rwalk->next;
			rwalk->next = NULL;
			bstart = blast;
			continue;
		}
		p = rwalk->pattern + strlen(rwalk->pattern);
		/* strip off chars that won't belong to a block */
		while (p > rwalk->pattern &&
				(*p < 'a' || *p > 'z') &&
				(*p < 'A' || *p > 'Z') &&
				*p != '_')
			p--;
		if (p == rwalk->pattern) {
			/* nothing we can do with a pattern like this */
			blast->next = malloc(sizeof(block));
			blast->next->prev = blast;
			blast = blast->next;
			blast->pattern = NULL;
			blast->hash = 0;
			blast->prio = rwalk->stop ? 1 : 2;
			blast->refcnt = 1;
			blast->seqnr = seq;
			blast->firstroute = rwalk;
			blast->lastroute = rwalk;
			blast->next = NULL;
			rnext = rwalk->next;
			rwalk->next = NULL;
			if (rwalk->stop) {
				bstart = blast;
				seq++;
			}
			continue;
		}
		/* find the block */
		bsum = 0;
		b = pblock;
		/* copy the trailing [a-zA-Z_]+ run in reverse, keeping a
		 * byte-sum as a cheap hash */
		while (p > rwalk->pattern &&
				b - pblock < sizeof(pblock) &&
				(
				 (*p >= 'a' && *p <= 'z') ||
				 (*p >= 'A' && *p <= 'Z') ||
				 *p == '_'
				))
		{
			bsum += *p;
			*b++ = *p--;
		}
		*b = '\0';
		b = pblock;

		if (strlen(b) < 3) {
			/* this probably isn't selective enough */
			blast->next = malloc(sizeof(block));
			blast->next->prev = blast;
			blast = blast->next;
			blast->pattern = NULL;
			blast->hash = 0;
			blast->prio = rwalk->stop ? 1 : 2;
			blast->refcnt = 1;
			blast->seqnr = seq;
			blast->firstroute = rwalk;
			blast->lastroute = rwalk;
			blast->next = NULL;
			rnext = rwalk->next;
			rwalk->next = NULL;
			if (rwalk->stop) {
				bstart = blast;
				seq++;
			}
			continue;
		}
		/* at this point, b points to the tail block in reverse, see if
		 * we already had such tail in place */
		for (bwalk = bstart->next; bwalk != NULL; bwalk = bwalk->next) {
			if (bwalk->hash != bsum || strcmp(bwalk->pattern, b) != 0)
				continue;
			break;
		}
		if (bwalk == NULL) {
			blast->next = malloc(sizeof(block));
			blast->next->prev = blast;
			blast = blast->next;
			blast->pattern = strdup(b);
			blast->hash = bsum;
			blast->prio = rwalk->stop ? 1 : 2;
			blast->refcnt = 1;
			blast->seqnr = seq;
			blast->firstroute = rwalk;
			blast->lastroute = rwalk;
			blast->next = NULL;
			rnext = rwalk->next;
			rwalk->next = NULL;
			if (rwalk->stop) {
				bstart = blast;
				seq++;
			}
			continue;
		}
		/* existing block: append this route to it */
		bwalk->refcnt++;
		bwalk->lastroute = bwalk->lastroute->next = rwalk;
		rnext = rwalk->next;
		rwalk->next = NULL;
		if (rwalk->stop) {
			/* move this one to the end */
			if (bwalk->next != NULL) {
				bwalk->prev->next = bwalk->next;
				blast = blast->next = bwalk;
				bwalk->next = NULL;
			}
			bwalk->prio = 1;
			bstart = blast;
			seq++;
		}
	}
	/* make loop below easier by appending a dummy (reuse the one from
	 * start) */
	blast = blast->next = blocks;
	blocks = blocks->next;
	blast->next = NULL;
	blast->seqnr = seq;
	rwalk = *routes = NULL;
	seq = 1;
	bstart = NULL;
	/* create groups, if feasible */
	for (bwalk = blocks; bwalk != NULL; bwalk = blast) {
		if (bwalk->seqnr != seq) {
			seq++;
			if (bstart != NULL) {
				/* re-insert the order-pinned block held back for this
				 * epoch */
				bstart->next = bwalk;
				bwalk = bstart;
			} else {
				blast = bwalk;
				continue;
			}
		} else if (bwalk->prio == 1) {
			bstart = bwalk;
			blast = bwalk->next;
			bstart->next = NULL;
			continue;
		}
		if (bwalk->refcnt == 0) {
			/* the dummy head: just discard */
			blast = bwalk->next;
			free(bwalk);
			continue;
		} else if (bwalk->refcnt < 3) {
			/* too few members to be worth a group: emit inline */
			if (*routes == NULL) {
				*routes = bwalk->firstroute;
			} else {
				rwalk->next = bwalk->firstroute;
			}
			rwalk = bwalk->lastroute;
			blast = bwalk->next;
			free(bwalk->pattern);
			free(bwalk);
		} else {
			/* wrap the members in a CONTAINS-guarded GROUP cluster */
			if (*routes == NULL) {
				rwalk = *routes = malloc(sizeof(route));
			} else {
				rwalk = rwalk->next = malloc(sizeof(route));
			}
			rwalk->pattern = NULL;
			rwalk->stop = 0;
			rwalk->matchtype = CONTAINS;
			rwalk->dests = malloc(sizeof(destinations));
			rwalk->dests->cl = malloc(sizeof(cluster));
			rwalk->dests->cl->name = bwalk->pattern;
			rwalk->dests->cl->type = GROUP;
			rwalk->dests->cl->members.routes = bwalk->firstroute;
			rwalk->dests->cl->next = NULL;
			rwalk->dests->next = NULL;
			rwalk->next = NULL;
			blast = bwalk->next;
			free(bwalk);
		}
		if (bwalk == bstart)
			bstart = NULL;
	}
}

/**
 * Returns all (unique) servers from the cluster-configuration.
*/ server ** router_getservers(cluster *clusters) { #define SERVSZ 511 server **ret = malloc(sizeof(server *) * SERVSZ + 1); cluster *c; servers *s; int i; *ret = NULL; #define add_server(X) { \ for (i = 0; i < SERVSZ && ret[i] != NULL; i++) \ if (ret[i] == X) \ break; \ if (i < SERVSZ && ret[i] == NULL) { \ ret[i] = X; \ ret[i + 1] = NULL; \ } \ } for (c = clusters; c != NULL; c = c->next) { if (c->type == FORWARD || c->type == FILELOG || c->type == FILELOGIP) { for (s = c->members.forward; s != NULL; s = s->next) add_server(s->server); } else if (c->type == ANYOF || c->type == FAILOVER) { for (s = c->members.anyof->list; s != NULL; s = s->next) add_server(s->server); } else if (c->type == CARBON_CH || c->type == FNV1A_CH || c->type == JUMP_CH) { for (s = c->members.ch->servers; s != NULL; s = s->next) add_server(s->server); } } return ret; } /** * Mere debugging function to check if the configuration is picked up * alright. If all is set to false, aggregation rules won't be printed. * This comes in handy because aggregations usually come in the order of * thousands. */ void router_printconfig(FILE *f, char mode, cluster *clusters, route *routes) { cluster *c; route *r; servers *s; #define PPROTO \ server_ctype(s->server) == CON_UDP ? " proto udp" : "" for (c = clusters; c != NULL; c = c->next) { if (c->type == BLACKHOLE || c->type == REWRITE || c->type == GROUP || c->type == AGGREGATION || c->type == AGGRSTUB) continue; fprintf(f, "cluster %s\n", c->name); if (c->type == FORWARD) { fprintf(f, " forward\n"); for (s = c->members.forward; s != NULL; s = s->next) fprintf(f, " %s:%d%s\n", server_ip(s->server), server_port(s->server), PPROTO); } else if (c->type == FILELOG || c->type == FILELOGIP) { fprintf(f, " file%s\n", c->type == FILELOGIP ? " ip" : ""); for (s = c->members.forward; s != NULL; s = s->next) fprintf(f, " %s\n", server_ip(s->server)); } else if (c->type == ANYOF || c->type == FAILOVER) { fprintf(f, " %s\n", c->type == ANYOF ? 
"any_of" : "failover"); for (s = c->members.anyof->list; s != NULL; s = s->next) fprintf(f, " %s:%d%s\n", server_ip(s->server), server_port(s->server), PPROTO); } else if (c->type == CARBON_CH || c->type == FNV1A_CH || c->type == JUMP_CH) { fprintf(f, " %s_ch replication %d\n", c->type == CARBON_CH ? "carbon" : c->type == FNV1A_CH ? "fnv1a" : "jump_fnv1a", c->members.ch->repl_factor); for (s = c->members.ch->servers; s != NULL; s = s->next) fprintf(f, " %s:%d%s%s%s\n", server_ip(s->server), server_port(s->server), server_instance(s->server) ? "=" : "", server_instance(s->server) ? server_instance(s->server) : "", PPROTO); } fprintf(f, " ;\n"); if (mode & 2) { if (c->type == CARBON_CH || c->type == FNV1A_CH || c->type == JUMP_CH) { fprintf(f, "# hash ring for %s follows\n", c->name); ch_printhashring(c->members.ch->ring, f); } } } fprintf(f, "\n"); for (r = routes; r != NULL; r = r->next) { if (r->dests->cl->type == AGGREGATION) { cluster *aggr = r->dests->cl; struct _aggr_computes *ac; char stubname[48]; char percentile[16]; if (!(mode & 1)) continue; if (mode & 2 || r->dests->next == NULL) { stubname[0] = '\0'; } else { snprintf(stubname, sizeof(stubname), "_stub_aggregator_%p__", aggr->members.aggregation); } fprintf(f, "aggregate"); if (r->next == NULL || r->next->dests->cl != aggr) { fprintf(f, " %s\n", r->pattern); } else { fprintf(f, "\n"); do { fprintf(f, " %s\n", r->pattern); } while (r->next != NULL && r->next->dests->cl == aggr && (r = r->next) != NULL); } fprintf(f, " every %u seconds\n" " expire after %u seconds\n" " timestamp at %s of bucket\n", aggr->members.aggregation->interval, aggr->members.aggregation->expire, aggr->members.aggregation->tswhen == TS_START ? "start" : aggr->members.aggregation->tswhen == TS_MIDDLE ? "middle" : aggr->members.aggregation->tswhen == TS_END ? 
"end" : ""); for (ac = aggr->members.aggregation->computes; ac != NULL; ac = ac->next) { snprintf(percentile, sizeof(percentile), "percentile%d", ac->percentile); fprintf(f, " compute %s write to\n" " %s\n", ac->type == SUM ? "sum" : ac->type == CNT ? "count" : ac->type == MAX ? "max" : ac->type == MIN ? "min" : ac->type == AVG ? "average" : ac->type == MEDN ? "median" : ac->type == PCTL ? percentile : ac->type == VAR ? "variance" : ac->type == SDEV ? "stddev" : "", ac->metric + strlen(stubname)); } if (!(mode & 2) && r->dests->next != NULL) { destinations *dn = r->dests->next; fprintf(f, " send to"); if (dn->next == NULL) { fprintf(f, " %s\n", dn->cl->name); } else { for (; dn != NULL; dn = dn->next) fprintf(f, "\n %s", dn->cl->name); fprintf(f, "\n"); } } fprintf(f, "%s ;\n", r->stop ? " stop\n" : ""); } else if (r->dests->cl->type == REWRITE) { fprintf(f, "rewrite %s\n into %s\n ;\n", r->pattern, r->dests->cl->members.replacement); } else if (r->dests->cl->type == GROUP) { size_t cnt = 0; route *rwalk; char blockname[64]; char *b = &blockname[sizeof(blockname) - 1]; char *p; for (rwalk = r->dests->cl->members.routes; rwalk != NULL; rwalk = rwalk->next) cnt++; /* reverse the name, to make it human consumable */ *b-- ='\0'; for (p = r->dests->cl->name; *p != '\0' && b > blockname; p++) *b-- = *p; fprintf(f, "# common pattern group '%s' " "contains %zu aggregations/matches\n", ++b, cnt); } else if (r->dests->cl->type == AGGRSTUB) { if (mode & 2) { fprintf(f, "# stub match for aggregate rule with send to\n"); fprintf(f, "match ^%s\n send to", r->pattern); if (r->dests->cl->members.routes->dests->next == NULL) { fprintf(f, " %s", r->dests->cl->members.routes->dests->cl->name); } else { destinations *d = r->dests->cl->members.routes->dests; for (; d != NULL; d = d->next) fprintf(f, "\n %s", d->cl->name); } fprintf(f, "%s\n ;\n", r->stop ? 
"\n stop" : ""); } } else { route *or = r; fprintf(f, "match"); if (r->next == NULL || r->next->dests != or->dests) { fprintf(f, " %s\n", r->matchtype == MATCHALL ? "*" : r->pattern); } else { fprintf(f, "\n"); do { fprintf(f, " %s\n", r->matchtype == MATCHALL ? "*" : r->pattern); } while (r->next != NULL && r->next->dests == or->dests && (r = r->next) != NULL); } fprintf(f, " send to"); if (or->dests->next == NULL) { fprintf(f, " %s", or->dests->cl->name); } else { destinations *d; for (d = or->dests; d != NULL; d = d->next) fprintf(f, "\n %s", d->cl->name); } fprintf(f, "\n%s ;\n", or->stop ? " stop\n" : ""); } } fflush(f); } /** * Free the routes and all associated resources. */ void router_free(cluster *clusters, route *routes) { cluster *c; route *r; servers *s; destinations *d; while (routes != NULL) { if (routes->pattern) free(routes->pattern); if (routes->strmatch) free(routes->strmatch); if (routes->matchtype == REGEX) regfree(&routes->rule); if (routes->next == NULL || routes->next->dests != routes->dests) { while (routes->dests != NULL) { if (routes->dests->cl->type == GROUP || routes->dests->cl->type == AGGRSTUB) router_free(NULL, routes->dests->cl->members.routes); /* avoid freeing pointer also in use by stub */ if (routes->dests->cl->type == AGGREGATION) { d = NULL; } else { d = routes->dests->next; } free(routes->dests); routes->dests = d; } } r = routes->next; free(routes); routes = r; } while (clusters != NULL) { switch (clusters->type) { case CARBON_CH: case FNV1A_CH: case JUMP_CH: assert(clusters->members.ch != NULL); ch_free(clusters->members.ch->ring); while (clusters->members.ch->servers) { s = clusters->members.ch->servers->next; free(clusters->members.ch->servers); clusters->members.ch->servers = s; } free(clusters->members.ch); break; case FORWARD: case FILELOG: case FILELOGIP: case BLACKHOLE: while (clusters->members.forward) { server_shutdown(clusters->members.forward->server); free(clusters->members.forward->server); s = 
clusters->members.forward->next; free(clusters->members.forward); clusters->members.forward = s; } break; case ANYOF: case FAILOVER: /* in case of secondaries, make sure nothing references * the servers anymore */ for (s = clusters->members.anyof->list; s != NULL; s = s->next) server_shutdown(s->server); while (clusters->members.anyof->list) { free(clusters->members.anyof->list->server); s = clusters->members.anyof->list->next; free(clusters->members.anyof->list); clusters->members.anyof->list = s; } free(clusters->members.anyof->servers); free(clusters->members.anyof); break; case GROUP: case AGGRSTUB: /* handled at the routes above */ break; case AGGREGATION: /* aggregators starve when they get no more input */ break; case REWRITE: if (clusters->members.replacement) free(clusters->members.replacement); break; } if (clusters->name) free(clusters->name); c = clusters->next; free(clusters); clusters = c; } } inline static char router_metric_matches( const route *r, char *metric, char *firstspace, regmatch_t *pmatch) { char ret = 0; switch (r->matchtype) { case MATCHALL: ret = 1; break; case REGEX: *firstspace = '\0'; ret = regexec(&r->rule, metric, r->nmatch, pmatch, 0) == 0; *firstspace = ' '; break; case CONTAINS: *firstspace = '\0'; ret = strstr(metric, r->strmatch) != NULL; *firstspace = ' '; break; case STARTS_WITH: ret = strncmp(metric, r->strmatch, strlen(r->strmatch)) == 0; break; case ENDS_WITH: *firstspace = '\0'; ret = strcmp( firstspace - strlen(r->strmatch), r->strmatch) == 0; *firstspace = ' '; break; case MATCHES: *firstspace = '\0'; ret = strcmp(metric, r->strmatch) == 0; *firstspace = ' '; break; default: ret = 0; break; } return ret; } inline size_t router_rewrite_metric( char (*newmetric)[METRIC_BUFSIZ], char **newfirstspace, const char *metric, const char *firstspace, const char *replacement, const size_t nmatch, const regmatch_t *pmatch) { char escape = 0; int ref = 0; char *s = *newmetric; const char *p; const char *q; const char *t; enum 
rewrite_case { RETAIN, LOWER, UPPER } rcase = RETAIN; assert(pmatch != NULL); /* insert leading part */ q = metric; t = metric + pmatch[0].rm_so; if (s - *newmetric + t - q < sizeof(*newmetric)) { while (q < t) *s++ = *q++; } else { return 0; /* won't fit, don't try further */ } for (p = replacement; ; p++) { switch (*p) { case '\\': if (!escape) { escape = 1; rcase = RETAIN; break; } /* fall through so we handle \1\2 */ default: if (escape == 1 && rcase == RETAIN && *p == '_') { rcase = LOWER; } else if (escape == 1 && rcase == RETAIN && *p == '^') { rcase = UPPER; } else if (escape && *p >= '0' && *p <= '9') { escape = 2; ref *= 10; ref += *p - '0'; } else { if (escape) { if (ref > 0 && ref <= nmatch && pmatch[ref].rm_so >= 0) { /* insert match part */ q = metric + pmatch[ref].rm_so; t = metric + pmatch[ref].rm_eo; if (s - *newmetric + t - q < sizeof(*newmetric)) { switch (rcase) { case RETAIN: while (q < t) *s++ = *q++; break; case LOWER: while (q < t) *s++ = (char)tolower(*q++); break; case UPPER: while (q < t) *s++ = (char)toupper(*q++); break; } } } ref = 0; } if (*p != '\\') { /* \1\2 case */ escape = 0; rcase = RETAIN; if (s - *newmetric + 1 < sizeof(*newmetric)) *s++ = *p; } } break; } if (*p == '\0') break; } /* undo trailing \0 */ s--; /* insert remaining part */ q = metric + pmatch[0].rm_eo; t = firstspace; if (s - *newmetric + t - q < sizeof(*newmetric)) { while (q < t) *s++ = *q++; } else { return 0; /* won't fit, don't try further */ } /* record new position of firstspace */ *newfirstspace = s; /* copy data part */ if (s - *newmetric + strlen(firstspace) < sizeof(*newmetric)) { for (p = firstspace; *p != '\0'; p++) *s++ = *p; *s++ = '\0'; return s - *newmetric; } return 0; /* we couldn't copy everything */ } static char router_route_intern( char *blackholed, destination ret[], size_t *curlen, size_t retsize, char *srcaddr, char *metric, char *firstspace, const route *r) { const route *w; destinations *d; char stop = 0; const char *p; const char *q = 
NULL; /* pacify compiler, won't happen in reality */ const char *t; char newmetric[METRIC_BUFSIZ]; char *newfirstspace = NULL; size_t len; regmatch_t pmatch[RE_MAX_MATCHES]; #define failif(RETLEN, WANTLEN) \ if (WANTLEN > RETLEN) { \ logerr("router_route: out of destination slots, " \ "increase CONN_DESTS_SIZE in router.h\n"); \ return 1; \ } for (w = r; w != NULL && keep_running; w = w->next) { if (w->dests->cl->type == GROUP) { /* strrstr doesn't exist, grrr * therefore the pattern in the group is stored in reverse, * such that we can start matching the tail easily without * having to calculate the end of the pattern string all the * time */ for (p = firstspace - 1; p >= metric; p--) { for (q = w->dests->cl->name, t = p; *q != '\0' && t >= metric; q++, t--) { if (*q != *t) break; } if (*q == '\0') break; } /* indirection */ assert(q != NULL); if (*q == '\0') stop = router_route_intern( blackholed, ret, curlen, retsize, srcaddr, metric, firstspace, w->dests->cl->members.routes); } else if (router_metric_matches(w, metric, firstspace, pmatch)) { stop = w->stop; /* rule matches, send to destination(s) */ for (d = w->dests; d != NULL; d = d->next) { switch (d->cl->type) { case BLACKHOLE: break; case FILELOGIP: { servers *s; snprintf(newmetric, sizeof(newmetric), "%s %s", srcaddr, metric); for (s = d->cl->members.forward; s != NULL; s = s->next) { failif(retsize, *curlen + 1); ret[*curlen].dest = s->server; ret[(*curlen)++].metric = strdup(newmetric); } *blackholed = 0; } break; case FILELOG: case FORWARD: { /* simple case, no logic necessary */ servers *s; for (s = d->cl->members.forward; s != NULL; s = s->next) { failif(retsize, *curlen + 1); ret[*curlen].dest = s->server; ret[(*curlen)++].metric = strdup(metric); } *blackholed = 0; } break; case ANYOF: { /* we queue the same metrics at the same server */ unsigned int hash; fnv1a_32(hash, p, metric, firstspace); /* We could use the retry approach here, but since * our c is very small compared to MAX_INT, the bias * 
we introduce for the last few of the range * (MAX_INT % c) can be considered neglicible given * the number of occurances of c in the range of * MAX_INT, therefore we stick with a simple mod. */ hash %= d->cl->members.anyof->count; failif(retsize, *curlen + 1); ret[*curlen].dest = d->cl->members.anyof->servers[hash]; ret[(*curlen)++].metric = strdup(metric); *blackholed = 0; } break; case FAILOVER: { /* queue at the first non-failing server */ unsigned short i; failif(retsize, *curlen + 1); ret[*curlen].dest = NULL; for (i = 0; i < d->cl->members.anyof->count; i++) { server *s = d->cl->members.anyof->servers[i]; if (server_failed(s)) continue; ret[*curlen].dest = s; break; } if (ret[*curlen].dest == NULL) /* all failed, take first server */ ret[*curlen].dest = d->cl->members.anyof->servers[0]; ret[(*curlen)++].metric = strdup(metric); *blackholed = 0; } break; case CARBON_CH: case FNV1A_CH: case JUMP_CH: { /* let the ring(bearer) decide */ failif(retsize, *curlen + d->cl->members.ch->repl_factor); ch_get_nodes( &ret[*curlen], d->cl->members.ch->ring, d->cl->members.ch->repl_factor, metric, firstspace); *curlen += d->cl->members.ch->repl_factor; *blackholed = 0; } break; case AGGREGATION: { /* aggregation rule */ aggregator_putmetric( d->cl->members.aggregation, metric, firstspace, w->nmatch, pmatch); *blackholed = 0; /* we need to break out of the inner loop. since * the rest of dests are meant for the stub, and * we should certainly not process it now */ while (d->next != NULL) d = d->next; } break; case REWRITE: { /* rewrite metric name */ if ((len = router_rewrite_metric( &newmetric, &newfirstspace, metric, firstspace, d->cl->members.replacement, w->nmatch, pmatch)) == 0) { logerr("router_route: failed to rewrite " "metric: newmetric size too small to hold " "replacement (%s -> %s)\n", metric, d->cl->members.replacement); break; }; /* scary! 
write back the rewritten metric */ memcpy(metric, newmetric, len); firstspace = metric + (newfirstspace - newmetric); } break; case AGGRSTUB: { /* strip off the stub pattern, and reroute this * thing */ router_route_intern( blackholed, ret, curlen, retsize, srcaddr, metric + strlen(w->pattern), firstspace + strlen(w->pattern), w->dests->cl->members.routes); } break; case GROUP: { /* this should not happen */ } break; } } /* stop processing further rules if requested */ if (stop) break; } } return stop; } /** * Looks up the locations the given metric_path should be sent to, and * returns the list of servers in ret, the number of servers is * returned in retcnt. * Returns whether the metric was blackholed (e.g. not routed anywhere). */ inline char router_route( destination ret[], size_t *retcnt, size_t retsize, char *srcaddr, char *metric, char *firstspace, route *routes) { size_t curlen = 0; char blackholed = 1; (void)router_route_intern(&blackholed, ret, &curlen, retsize, srcaddr, metric, firstspace, routes); *retcnt = curlen; return blackholed; } /** * Prints for metric_path which rules and/or aggregations would be * triggered. Useful for testing regular expressions. 
*/ char router_test_intern(char *metric, char *firstspace, route *routes) { route *w; destinations *d; char gotmatch = 0; char newmetric[METRIC_BUFSIZ]; char *newfirstspace = NULL; size_t len; regmatch_t pmatch[RE_MAX_MATCHES]; for (w = routes; w != NULL; w = w->next) { if (w->dests->cl->type == GROUP) { /* just recurse, in test mode performance shouldn't be an * issue at all */ gotmatch |= router_test_intern( metric, firstspace, w->dests->cl->members.routes); if (gotmatch & 2) break; } else if (router_metric_matches(w, metric, firstspace, pmatch)) { gotmatch = 1; switch (w->dests->cl->type) { case AGGREGATION: fprintf(stdout, "aggregation\n"); break; case REWRITE: fprintf(stdout, "rewrite\n"); break; case AGGRSTUB: { gotmatch |= router_test_intern( metric + strlen(w->pattern), firstspace + strlen(w->pattern), w->dests->cl->members.routes); return gotmatch; } break; default: fprintf(stdout, "match\n"); break; } *firstspace = '\0'; switch (w->matchtype) { case MATCHALL: fprintf(stdout, " * -> %s\n", metric); break; case REGEX: fprintf(stdout, " %s (regex) -> %s\n", w->pattern, metric); break; default: { char *x; switch (w->matchtype) { case CONTAINS: x = "strstr"; break; case STARTS_WITH: x = "strncmp"; break; case ENDS_WITH: x = "tailcmp"; break; case MATCHES: x = "strcmp"; break; default: x = "!impossible?"; break; } fprintf(stdout, " %s [%s: %s]\n -> %s\n", w->pattern, x, w->strmatch, metric); } break; } *firstspace = ' '; for (d = w->dests; d != NULL; d = d->next) { switch (d->cl->type) { case AGGREGATION: { struct _aggr_computes *ac; char newmetric[METRIC_BUFSIZ]; char *newfirstspace = NULL; int stublen = 0; char percentile[16]; if (mode == DEBUGTEST || d->next == NULL) { stublen = 0; } else { char x; stublen = snprintf(&x, 1, "_stub_aggregator_%p__", d->cl->members.aggregation); } for (ac = d->cl->members.aggregation->computes; ac != NULL; ac = ac->next) { if (w->nmatch == 0 || (len = router_rewrite_metric( &newmetric, &newfirstspace, metric, firstspace, 
ac->metric, w->nmatch, pmatch)) == 0) { if (w->nmatch > 0) { fprintf(stderr, "router_test: failed to " "rewrite metric: newmetric size too " "small to hold replacement " "(%s -> %s)\n", metric, ac->metric); break; } len = snprintf(newmetric, sizeof(newmetric), "%s", ac->metric); if (len >= sizeof(newmetric)) len = sizeof(newmetric) - 1; newfirstspace = newmetric + len; } snprintf(percentile, sizeof(percentile), "percentile%d", ac->percentile); fprintf(stdout, " %s%s%s%s -> %s\n", ac->type == SUM ? "sum" : ac->type == CNT ? "count" : ac->type == MAX ? "max" : ac->type == MIN ? "min" : ac->type == AVG ? "average" : ac->type == MEDN ? "median" : ac->type == PCTL ? percentile : ac->type == VAR ? "variance" : ac->type == SDEV ? "stddev" : "", w->nmatch > 0 ? "(" : "", w->nmatch > 0 ? ac->metric + stublen : "", w->nmatch > 0 ? ")" : "", newmetric + stublen); if (mode == DEBUGTEST && d->next != NULL) { gotmatch |= router_test_intern( newmetric, newfirstspace, routes); } } if (mode == DEBUGTEST) { return gotmatch; } else { gotmatch |= 4; } } break; case BLACKHOLE: { fprintf(stdout, " blackholed\n"); } break; case REWRITE: { /* rewrite metric name */ if ((len = router_rewrite_metric( &newmetric, &newfirstspace, metric, firstspace, d->cl->members.replacement, w->nmatch, pmatch)) == 0) { fprintf(stderr, "router_test: failed to rewrite " "metric: newmetric size too small to hold " "replacement (%s -> %s)\n", metric, d->cl->members.replacement); break; }; /* scary! 
write back the rewritten metric */ memcpy(metric, newmetric, len); firstspace = metric + (newfirstspace - newmetric); *firstspace = '\0'; fprintf(stdout, " into(%s) -> %s\n", d->cl->members.replacement, metric); *firstspace = ' '; } break; case FORWARD: { servers *s; fprintf(stdout, " forward(%s)\n", d->cl->name); for (s = d->cl->members.forward; s != NULL; s = s->next) fprintf(stdout, " %s:%d\n", server_ip(s->server), server_port(s->server)); } break; case CARBON_CH: case FNV1A_CH: case JUMP_CH: { destination dst[CONN_DESTS_SIZE]; int i; fprintf(stdout, " %s_ch(%s)\n", d->cl->type == CARBON_CH ? "carbon" : d->cl->type == FNV1A_CH ? "fnv1a" : "jump_fnv1a", d->cl->name); if (gotmatch & 4) break; if (mode == DEBUGTEST) { fprintf(stdout, " hash_pos(%d)\n", ch_gethashpos(d->cl->members.ch->ring, metric, firstspace)); } ch_get_nodes(dst, d->cl->members.ch->ring, d->cl->members.ch->repl_factor, metric, firstspace); for (i = 0; i < d->cl->members.ch->repl_factor; i++) { fprintf(stdout, " %s:%d\n", server_ip(dst[i].dest), server_port(dst[i].dest)); free((char *)dst[i].metric); } } break; case FAILOVER: case ANYOF: { unsigned int hash; fprintf(stdout, " %s(%s)\n", d->cl->type == ANYOF ? 
"any_of" : "failover", d->cl->name); if (gotmatch & 4) break; if (d->cl->type == ANYOF) { const char *p; fnv1a_32(hash, p, metric, firstspace); hash %= d->cl->members.anyof->count; } else { hash = 0; } fprintf(stdout, " %s:%d\n", server_ip(d->cl->members.anyof->servers[hash]), server_port(d->cl->members.anyof->servers[hash])); } break; default: { fprintf(stdout, " cluster(%s)\n", d->cl->name); } break; } } if (w->stop) { gotmatch = 3; fprintf(stdout, " stop\n"); break; } } } return gotmatch; } void router_test(char *metric, route *routes) { char *firstspace; for (firstspace = metric; *firstspace != '\0'; firstspace++) if (*firstspace == ' ') break; if (!router_test_intern(metric, firstspace, routes)) { *firstspace = '\0'; fprintf(stdout, "nothing matched %s\n", metric); } fflush(stdout); } void router_shutdown(void) { keep_running = 0; } carbon-c-relay-1.7/router.h000066400000000000000000000032511265266732300156650ustar00rootroot00000000000000/* * Copyright 2013-2016 Fabian Groffen * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #ifndef ROUTER_H #define ROUTER_H 1 #include #include #include "server.h" #include "aggregator.h" #define CONN_DESTS_SIZE 64 typedef struct { const char *metric; server *dest; } destination; typedef struct _cluster cluster; typedef struct _route route; #define RE_MAX_MATCHES 64 int router_readconfig(cluster **clret, route **rret, aggregator **aret, const char *path, size_t queuesize, size_t batchsize, unsigned short iotimeout); void router_optimise(route **routes); size_t router_rewrite_metric(char (*newmetric)[METRIC_BUFSIZ], char **newfirstspace, const char *metric, const char *firstspace, const char *replacement, const size_t nmatch, const regmatch_t *pmatch); void router_printconfig(FILE *f, char mode, cluster *clusters, route *routes); char router_route(destination ret[], size_t *retcnt, size_t retsize, char *srcaddr, char *metric, char *firstspace, route *routes); void router_test(char *metric_path, route *routes); server **router_getservers(cluster *clusters); void router_shutdown(void); void router_free(cluster *clusters, route *r); #endif carbon-c-relay-1.7/server.c000066400000000000000000000477441265266732300156650ustar00rootroot00000000000000/* * Copyright 2013-2016 Fabian Groffen * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "relay.h" #include "queue.h" #include "dispatcher.h" #include "collector.h" #include "server.h" struct _server { const char *ip; unsigned short port; char *instance; struct addrinfo *saddr; int fd; queue *queue; size_t bsize; short iotimeout; const char **batch; serv_ctype ctype; pthread_t tid; struct _server **secondaries; size_t secondariescnt; char failover:1; char failure:5; char running:1; char keep_running:1; size_t metrics; size_t dropped; size_t stalls; size_t ticks; size_t prevmetrics; size_t prevdropped; size_t prevstalls; size_t prevticks; }; /** * Reads from the queue and sends items to the remote server. This * function is designed to be a thread. Data sending is attempted to be * batched, but sent one by one to reduce loss on sending failure. * A connection with the server is maintained for as long as there is * data to be written. As soon as there is none, the connection is * dropped. 
*/ static void * server_queuereader(void *d) { server *self = (server *)d; size_t len; ssize_t slen; const char **metric = self->batch; struct timeval start, stop; struct timeval timeout; int timeoutms; queue *queue; char idle = 0; size_t *secpos = NULL; *metric = NULL; self->metrics = 0; self->ticks = 0; #define FAIL_WAIT_TIME 6 /* 6 * 250ms = 1.5s */ #define DISCONNECT_WAIT_TIME 12 /* 12 * 250ms = 3s */ #define LEN_CRITICAL(Q) (queue_free(Q) < self->bsize) self->running = 1; while (1) { if (queue_len(self->queue) == 0) { /* if we're idling, close the TCP connection, this allows us * to reduce connections, while keeping the connection alive * if we're writing a lot */ gettimeofday(&start, NULL); if (self->ctype == CON_TCP && self->fd >= 0 && idle++ > DISCONNECT_WAIT_TIME) { close(self->fd); self->fd = -1; } gettimeofday(&stop, NULL); self->ticks += timediff(start, stop); if (!self->keep_running) break; /* nothing to do, so slow down for a bit */ usleep((200 + (rand() % 100)) * 1000); /* 200ms - 300ms */ /* if we are in failure mode, keep checking if we can * connect, this avoids unnecessary queue moves */ if (!self->failure) /* it makes no sense to try and do something, so skip */ continue; } else if (self->secondariescnt > 0 && (self->failure >= FAIL_WAIT_TIME || (!self->failover && LEN_CRITICAL(self->queue)))) { size_t i; gettimeofday(&start, NULL); if (self->secondariescnt > 0) { if (secpos == NULL) { secpos = malloc(sizeof(size_t) * self->secondariescnt); if (secpos == NULL) { logerr("server: failed to allocate memory " "for secpos\n"); gettimeofday(&stop, NULL); self->ticks += timediff(start, stop); continue; } for (i = 0; i < self->secondariescnt; i++) secpos[i] = i; } if (!self->failover) { /* randomise the failover list such that in the * grand scheme of things we don't punish the first * working server in the list to deal with all * traffic meant for a now failing server */ for (i = 0; i < self->secondariescnt; i++) { size_t n = rand() % 
(self->secondariescnt - i); if (n != i) { size_t t = secpos[n]; secpos[n] = secpos[i]; secpos[i] = t; } } } } /* offload data from our queue to our secondaries * when doing so, observe the following: * - avoid nodes that are in failure mode * - avoid nodes which queues are >= critical_len * when no nodes remain given the above * - send to nodes which queue size < critical_len * where there are no such nodes * - do nothing (we will overflow, since we can't send * anywhere) */ *metric = NULL; queue = NULL; for (i = 0; i < self->secondariescnt; i++) { /* both conditions below make sure we skip ourself */ if (self->secondaries[secpos[i]]->failure) continue; queue = self->secondaries[secpos[i]]->queue; if (!self->failover && LEN_CRITICAL(queue)) { queue = NULL; continue; } if (*metric == NULL) { /* send up to batch size of our queue to this queue */ len = queue_dequeue_vector( self->batch, self->queue, self->bsize); self->batch[len] = NULL; metric = self->batch; } for (; *metric != NULL; metric++) if (!queue_putback(queue, *metric)) break; /* try to put back stuff that didn't fit */ for (; *metric != NULL; metric++) if (!queue_putback(self->queue, *metric)) break; } for (; *metric != NULL; metric++) { if (mode == DEBUG) logerr("dropping metric: %s", *metric); free((char *)*metric); self->dropped++; } gettimeofday(&stop, NULL); self->ticks += timediff(start, stop); if (queue == NULL) { /* we couldn't do anything, take it easy for a bit */ if (self->failure) self->failure = 1; if (!self->keep_running) break; usleep((200 + (rand() % 100)) * 1000); /* 200ms - 300ms */ } } else if (self->failure) { if (!self->keep_running) break; usleep((200 + (rand() % 100)) * 1000); /* 200ms - 300ms */ } /* at this point we've got work to do, if we're instructed to * shut down, however, try to get everything out of the door * (until we fail, see top of this loop) */ gettimeofday(&start, NULL); /* try to connect */ if (self->fd < 0) { if (self->ctype == CON_PIPE) { int intconn[2]; if 
(pipe(intconn) < 0) { if (!self->failure) logerr("failed to create pipe: %s\n", strerror(errno)); self->failure += self->failure >= FAIL_WAIT_TIME ? 0 : 1; continue; } dispatch_addconnection(intconn[0]); self->fd = intconn[1]; } else if (self->ctype == CON_UDP) { if ((self->fd = socket(self->saddr->ai_family, self->saddr->ai_socktype, self->saddr->ai_protocol)) < 0) { if (!self->failure) logerr("failed to create udp socket: %s\n", strerror(errno)); self->failure += self->failure >= FAIL_WAIT_TIME ? 0 : 1; continue; } if (connect(self->fd, self->saddr->ai_addr, self->saddr->ai_addrlen) < 0) { if (!self->failure) logerr("failed to connect udp socket: %s\n", strerror(errno)); close(self->fd); self->fd = -1; self->failure += self->failure >= FAIL_WAIT_TIME ? 0 : 1; continue; } } else if (self->ctype == CON_FILE) { if ((self->fd = open(self->ip, O_WRONLY | O_APPEND | O_CREAT, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)) < 0) { if (!self->failure) logerr("failed to open file '%s': %s\n", self->ip, strerror(errno)); self->failure += self->failure >= FAIL_WAIT_TIME ? 0 : 1; continue; } } else { int ret; int args; if ((self->fd = socket(self->saddr->ai_family, self->saddr->ai_socktype, self->saddr->ai_protocol)) < 0) { if (!self->failure) logerr("failed to create socket: %s\n", strerror(errno)); self->failure += self->failure >= FAIL_WAIT_TIME ? 
0 : 1; continue; } /* put socket in non-blocking mode such that we can * poll() (time-out) on the connect() call */ args = fcntl(self->fd, F_GETFL, NULL); (void) fcntl(self->fd, F_SETFL, args | O_NONBLOCK); ret = connect(self->fd, self->saddr->ai_addr, self->saddr->ai_addrlen); if (ret < 0 && errno == EINPROGRESS) { /* wait for connection to succeed if the OS thinks * it can succeed */ struct pollfd ufds[1]; ufds[0].fd = self->fd; ufds[0].events = POLLIN | POLLOUT; ret = poll(ufds, 1, self->iotimeout + (rand() % 100)); if (ret == 0) { /* time limit expired */ if (!self->failure) logerr("failed to connect() to " "%s:%u: Operation timed out\n", self->ip, self->port); close(self->fd); self->fd = -1; self->failure += self->failure >= FAIL_WAIT_TIME ? 0 : 1; continue; } else if (ret < 0) { /* some select error occurred */ if (!self->failure) logerr("failed to poll() for %s:%u: %s\n", self->ip, self->port, strerror(errno)); close(self->fd); self->fd = -1; self->failure += self->failure >= FAIL_WAIT_TIME ? 0 : 1; continue; } else { if (ufds[0].revents & POLLHUP) { if (!self->failure) logerr("failed to connect() for %s:%u: " "Hangup\n", self->ip, self->port); close(self->fd); self->fd = -1; self->failure += self->failure >= FAIL_WAIT_TIME ? 0 : 1; continue; } } } else if (ret < 0) { if (!self->failure) { logerr("failed to connect() to %s:%u: %s\n", self->ip, self->port, strerror(errno)); dispatch_check_rlimit_and_warn(); } close(self->fd); self->fd = -1; self->failure += self->failure >= FAIL_WAIT_TIME ? 
0 : 1; continue; } /* make socket blocking again */ (void) fcntl(self->fd, F_SETFL, args); } /* ensure we will break out of connections being stuck */ timeoutms = self->iotimeout + (rand() % 100); timeout.tv_sec = timeoutms / 1000; timeout.tv_usec = (timeoutms % 1000) * 1000; setsockopt(self->fd, SOL_SOCKET, SO_SNDTIMEO, &timeout, sizeof(timeout)); #ifdef SO_NOSIGPIPE setsockopt(self->fd, SOL_SOCKET, SO_NOSIGPIPE, NULL, 0); #endif } /* send up to batch size */ len = queue_dequeue_vector(self->batch, self->queue, self->bsize); self->batch[len] = NULL; metric = self->batch; if (len != 0 && !self->keep_running) { /* be noisy during shutdown so we can track any slowing down * servers, possibly preventing us to shut down */ logerr("shutting down %s:%u: waiting for %zu metrics\n", self->ip, self->port, len + queue_len(self->queue)); } if (len == 0 && self->failure) { /* if we don't have anything to send, we have at least a * connection succeed, so assume the server is up again, * this is in particular important for recovering this * node by probes, to avoid starvation of this server since * its queue is possibly being offloaded to secondaries */ if (self->ctype != CON_UDP) logerr("server %s:%u: OK after probe\n", self->ip, self->port); self->failure = 0; } for (; *metric != NULL; metric++) { len = strlen(*metric); if ((slen = write(self->fd, *metric, len)) != len) { /* not fully sent, or failure, close connection * regardless so we don't get synchonisation problems, * partially sent data is an error for us, since we use * blocking sockets, and hence partial sent is * indication of a failure */ if (self->ctype != CON_UDP && !self->failure) logerr("failed to write() to %s:%u: %s\n", self->ip, self->port, (slen < 0 ? strerror(errno) : "uncomplete write")); close(self->fd); self->fd = -1; self->failure += self->failure >= FAIL_WAIT_TIME ? 
0 : 1; /* put back stuff we couldn't process */ for (; *metric != NULL; metric++) { if (!queue_putback(self->queue, *metric)) { if (mode == DEBUG) logerr("server %s:%u: dropping metric: %s", self->ip, self->port, *metric); free((char *)*metric); self->dropped++; } } break; } else if (self->failure) { if (self->ctype != CON_UDP) logerr("server %s:%u: OK\n", self->ip, self->port); self->failure = 0; } free((char *)*metric); self->metrics++; } gettimeofday(&stop, NULL); self->ticks += timediff(start, stop); idle = 0; } self->running = 0; if (self->fd >= 0) close(self->fd); return NULL; } /** * Allocate a new (outbound) server. Effectively this means a thread * that reads from the queue and sends this as good as it can to the ip * address and port associated. */ server * server_new( const char *ip, unsigned short port, serv_ctype ctype, struct addrinfo *saddr, size_t qsize, size_t bsize, unsigned short iotimeout) { server *ret; if ((ret = malloc(sizeof(server))) == NULL) return NULL; ret->ctype = ctype; ret->tid = 0; ret->secondaries = NULL; ret->secondariescnt = 0; ret->ip = strdup(ip); ret->port = port; ret->instance = NULL; ret->bsize = bsize; ret->iotimeout = iotimeout < 250 ? 600 : iotimeout; if ((ret->batch = malloc(sizeof(char *) * (bsize + 1))) == NULL) { free(ret); return NULL; } ret->fd = -1; ret->saddr = saddr; ret->queue = queue_new(qsize); if (ret->queue == NULL) { free((char *)ret->ip); free(ret); return NULL; } ret->failover = 0; ret->failure = 0; ret->running = 0; ret->keep_running = 1; ret->metrics = 0; ret->dropped = 0; ret->stalls = 0; ret->ticks = 0; ret->prevmetrics = 0; ret->prevdropped = 0; ret->prevstalls = 0; ret->prevticks = 0; if (pthread_create(&ret->tid, NULL, &server_queuereader, ret) != 0) { free((char *)ret->ip); queue_destroy(ret->queue); free(ret); return NULL; } return ret; } /** * Adds a list of secondary servers to this server. A secondary server * is a server which' queue will be checked when this server has nothing * to do. 
This is different from a backup server in that all servers * involved have their own queue which they are supposed to deal with. */ void server_add_secondaries(server *self, server **secondaries, size_t count) { self->secondaries = secondaries; self->secondariescnt = count; } /** * Flags this server as part of a failover cluster, which means the * secondaries are used only to offload on failure, not on queue stress. */ void server_set_failover(server *self) { self->failover = 1; } /** * Sets instance name only used for carbon_ch cluster type. */ void server_set_instance(server *self, char *instance) { self->instance = strdup(instance); } /** * Thin wrapper around the associated queue with the server object. * Returns true if the metric could be queued for sending, or the metric * was dropped because the associated server is down. Returns false * otherwise (when a retry seems like it could succeed shortly). */ inline char server_send(server *s, const char *d, char force) { if (queue_free(s->queue) == 0) { char failure = s->failure; if (!force && s->secondariescnt > 0) { size_t i; /* don't immediately drop if we know there are others that * back us up */ for (i = 0; i < s->secondariescnt; i++) { if (!s->secondaries[i]->failure) { failure = 0; break; } } } if (failure || force) { s->dropped++; /* excess event will be dropped by the enqueue below */ } else { s->stalls++; return 0; } } queue_enqueue(s->queue, d); return 1; } /** * Signals this server to stop whatever it's doing. */ void server_stop(server *s) { if (s->secondariescnt == 0) s->keep_running = 0; } /** * Waits for this server to finish sending pending items from its queue. 
*/ void server_shutdown(server *s) { int i; pthread_t tid; size_t failures; size_t inqueue; int err; const char *p; if (s->tid == 0) return; tid = s->tid; s->tid = 0; if (s->secondariescnt > 0) { /* if we have a working connection, or we still have stuff in * our queue, wait for our secondaries, as they might need us, * or we need them */ do { failures = 0; inqueue = 0; for (i = 0; i < s->secondariescnt; i++) { if (s->secondaries[i]->failure) failures++; if (s->secondaries[i]->running) inqueue += queue_len(s->secondaries[i]->queue); } /* loop until we all failed, or nothing is in the queues */ } while (failures != s->secondariescnt && inqueue != 0 && logerr("any_of cluster pending %zu metrics " "(with %zu failed nodes)\n", inqueue, failures) >= -1 && usleep((200 + (rand() % 100)) * 1000) <= 0); /* shut down entire cluster */ for (i = 0; i < s->secondariescnt; i++) s->secondaries[i]->keep_running = 0; /* to pretend to be dead for above loop (just in case) */ if (inqueue != 0) for (i = 0; i < s->secondariescnt; i++) s->secondaries[i]->failure = 1; } s->keep_running = 0; if ((err = pthread_join(tid, NULL)) != 0) logerr("%s:%u: failed to join server thread: %s\n", s->ip, s->port, strerror(err)); if (s->ctype == CON_TCP) { size_t qlen = queue_len(s->queue); if (qlen > 0) logerr("dropping %zu metrics for %s:%u\n", qlen, s->ip, s->port); } /* drain queue not to leak the memory consumed by pending metrics */ while ((p = queue_dequeue(s->queue)) != NULL) free((char *)p); queue_destroy(s->queue); free(s->batch); if (s->instance) free(s->instance); if (s->saddr != NULL) freeaddrinfo(s->saddr); free((char *)s->ip); s->ip = NULL; } /** * Returns the ip address this server points to. */ inline const char * server_ip(server *s) { if (s == NULL) return NULL; return s->ip; } /** * Returns the port this server connects at. */ inline unsigned short server_port(server *s) { if (s == NULL) return 0; return s->port; } /** * Returns the instance associated with this server. 
*/ inline char * server_instance(server *s) { return s->instance; } /** * Returns the connection type of this server. */ inline serv_ctype server_ctype(server *s) { if (s == NULL) return CON_PIPE; return s->ctype; } /** * Returns whether the last action on this server caused a failure. */ inline char server_failed(server *s) { if (s == NULL) return 0; return s->failure; } /** * Returns the wall-clock time in microseconds (us) consumed sending metrics. */ inline size_t server_get_ticks(server *s) { if (s == NULL) return 0; return s->ticks; } /** * Returns the wall-clock time in microseconds (us) consumed since last * call to this function. */ inline size_t server_get_ticks_sub(server *s) { size_t d; if (s == NULL) return 0; d = s->ticks - s->prevticks; s->prevticks += d; return d; } /** * Returns the number of metrics sent since start. */ inline size_t server_get_metrics(server *s) { if (s == NULL) return 0; return s->metrics; } /** * Returns the number of metrics sent since last call to this function. */ inline size_t server_get_metrics_sub(server *s) { size_t d; if (s == NULL) return 0; d = s->metrics - s->prevmetrics; s->prevmetrics += d; return d; } /** * Returns the number of metrics dropped since start. */ inline size_t server_get_dropped(server *s) { if (s == NULL) return 0; return s->dropped; } /** * Returns the number of metrics dropped since last call to this function. */ inline size_t server_get_dropped_sub(server *s) { size_t d; if (s == NULL) return 0; d = s->dropped - s->prevdropped; s->prevdropped += d; return d; } /** * Returns the number of stalls since start. A stall happens when the * queue is full, but it appears as if it would be a good idea to wait * for a brief period and retry. */ inline size_t server_get_stalls(server *s) { if (s == NULL) return 0; return s->stalls; } /** * Returns the number of stalls since last call to this function. 
*/ inline size_t server_get_stalls_sub(server *s) { size_t d; if (s == NULL) return 0; d = s->stalls - s->prevstalls; s->prevstalls += d; return d; } /** * Returns the (approximate) number of metrics waiting to be sent. */ inline size_t server_get_queue_len(server *s) { if (s == NULL) return 0; return queue_len(s->queue); } /** * Returns the allocated size of the queue backing metrics waiting to be * sent. */ inline size_t server_get_queue_size(server *s) { if (s == NULL) return 0; return queue_size(s->queue); } carbon-c-relay-1.7/server.h000066400000000000000000000033111265266732300156500ustar00rootroot00000000000000/* * Copyright 2013-2016 Fabian Groffen * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #ifndef SERVER_H #define SERVER_H 1 #include #include "relay.h" typedef struct _server server; server *server_new( const char *ip, unsigned short port, serv_ctype ctype, struct addrinfo *saddr, size_t queuesize, size_t batchsize, unsigned short iotimeout); void server_add_secondaries(server *d, server **sec, size_t cnt); void server_set_failover(server *d); void server_set_instance(server *d, char *inst); char server_send(server *s, const char *d, char force); void server_stop(server *s); void server_shutdown(server *s); const char *server_ip(server *s); unsigned short server_port(server *s); char *server_instance(server *s); serv_ctype server_ctype(server *s); char server_failed(server *s); size_t server_get_ticks(server *s); size_t server_get_metrics(server *s); size_t server_get_stalls(server *s); size_t server_get_dropped(server *s); size_t server_get_ticks_sub(server *s); size_t server_get_metrics_sub(server *s); size_t server_get_stalls_sub(server *s); size_t server_get_dropped_sub(server *s); size_t server_get_queue_len(server *s); size_t server_get_queue_size(server *s); #endif