pax_global_header00006660000000000000000000000064130737426460014526gustar00rootroot0000000000000052 comment=ce8abaa0c60c2d6bee7219f5ddf500e0a1457b28 memberlist-0.1.0/000077500000000000000000000000001307374264600136675ustar00rootroot00000000000000memberlist-0.1.0/.gitignore000066400000000000000000000004161307374264600156600ustar00rootroot00000000000000# Compiled Object files, Static and Dynamic libs (Shared Objects) *.o *.a *.so # Folders _obj _test # Architecture specific extensions/prefixes *.[568vq] [568vq].out *.cgo1.go *.cgo2.c _cgo_defun.c _cgo_gotypes.go _cgo_export.* _testmain.go *.exe *.test .vagrant/ memberlist-0.1.0/LICENSE000066400000000000000000000371511307374264600147030ustar00rootroot00000000000000Mozilla Public License, version 2.0 1. Definitions 1.1. “Contributor” means each individual or legal entity that creates, contributes to the creation of, or owns Covered Software. 1.2. “Contributor Version” means the combination of the Contributions of others (if any) used by a Contributor and that particular Contributor’s Contribution. 1.3. “Contribution” means Covered Software of a particular Contributor. 1.4. “Covered Software” means Source Code Form to which the initial Contributor has attached the notice in Exhibit A, the Executable Form of such Source Code Form, and Modifications of such Source Code Form, in each case including portions thereof. 1.5. “Incompatible With Secondary Licenses” means a. that the initial Contributor has attached the notice described in Exhibit B to the Covered Software; or b. that the Covered Software was made available under the terms of version 1.1 or earlier of the License, but not also under the terms of a Secondary License. 1.6. “Executable Form” means any form of the work other than Source Code Form. 1.7. “Larger Work” means a work that combines Covered Software with other material, in a separate file or files, that is not Covered Software. 1.8. “License” means this document. 1.9. “Licensable” means having the right to grant, to the maximum extent possible, whether at the time of the initial grant or subsequently, any and all of the rights conveyed by this License. 1.10. “Modifications” means any of the following: a. any file in Source Code Form that results from an addition to, deletion from, or modification of the contents of Covered Software; or b. any new file in Source Code Form that contains any Covered Software. 1.11. “Patent Claims” of a Contributor means any patent claim(s), including without limitation, method, process, and apparatus claims, in any patent Licensable by such Contributor that would be infringed, but for the grant of the License, by the making, using, selling, offering for sale, having made, import, or transfer of either its Contributions or its Contributor Version. 1.12. “Secondary License” means either the GNU General Public License, Version 2.0, the GNU Lesser General Public License, Version 2.1, the GNU Affero General Public License, Version 3.0, or any later versions of those licenses. 1.13. “Source Code Form” means the form of the work preferred for making modifications. 1.14. “You” (or “Your”) means an individual or a legal entity exercising rights under this License. For legal entities, “You” includes any entity that controls, is controlled by, or is under common control with You. 
For purposes of this definition, “control” means (a) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (b) ownership of more than fifty percent (50%) of the outstanding shares or beneficial ownership of such entity. 2. License Grants and Conditions 2.1. Grants Each Contributor hereby grants You a world-wide, royalty-free, non-exclusive license: a. under intellectual property rights (other than patent or trademark) Licensable by such Contributor to use, reproduce, make available, modify, display, perform, distribute, and otherwise exploit its Contributions, either on an unmodified basis, with Modifications, or as part of a Larger Work; and b. under Patent Claims of such Contributor to make, use, sell, offer for sale, have made, import, and otherwise transfer either its Contributions or its Contributor Version. 2.2. Effective Date The licenses granted in Section 2.1 with respect to any Contribution become effective for each Contribution on the date the Contributor first distributes such Contribution. 2.3. Limitations on Grant Scope The licenses granted in this Section 2 are the only rights granted under this License. No additional rights or licenses will be implied from the distribution or licensing of Covered Software under this License. Notwithstanding Section 2.1(b) above, no patent license is granted by a Contributor: a. for any code that a Contributor has removed from Covered Software; or b. for infringements caused by: (i) Your and any other third party’s modifications of Covered Software, or (ii) the combination of its Contributions with other software (except as part of its Contributor Version); or c. under Patent Claims infringed by Covered Software in the absence of its Contributions. This License does not grant any rights in the trademarks, service marks, or logos of any Contributor (except as may be necessary to comply with the notice requirements in Section 3.4). 2.4. Subsequent Licenses No Contributor makes additional grants as a result of Your choice to distribute the Covered Software under a subsequent version of this License (see Section 10.2) or under the terms of a Secondary License (if permitted under the terms of Section 3.3). 2.5. Representation Each Contributor represents that the Contributor believes its Contributions are its original creation(s) or it has sufficient rights to grant the rights to its Contributions conveyed by this License. 2.6. Fair Use This License is not intended to limit any rights You have under applicable copyright doctrines of fair use, fair dealing, or other equivalents. 2.7. Conditions Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted in Section 2.1. 3. Responsibilities 3.1. Distribution of Source Form All distribution of Covered Software in Source Code Form, including any Modifications that You create or to which You contribute, must be under the terms of this License. You must inform recipients that the Source Code Form of the Covered Software is governed by the terms of this License, and how they can obtain a copy of this License. You may not attempt to alter or restrict the recipients’ rights in the Source Code Form. 3.2. Distribution of Executable Form If You distribute Covered Software in Executable Form then: a. 
such Covered Software must also be made available in Source Code Form, as described in Section 3.1, and You must inform recipients of the Executable Form how they can obtain a copy of such Source Code Form by reasonable means in a timely manner, at a charge no more than the cost of distribution to the recipient; and b. You may distribute such Executable Form under the terms of this License, or sublicense it under different terms, provided that the license for the Executable Form does not attempt to limit or alter the recipients’ rights in the Source Code Form under this License. 3.3. Distribution of a Larger Work You may create and distribute a Larger Work under terms of Your choice, provided that You also comply with the requirements of this License for the Covered Software. If the Larger Work is a combination of Covered Software with a work governed by one or more Secondary Licenses, and the Covered Software is not Incompatible With Secondary Licenses, this License permits You to additionally distribute such Covered Software under the terms of such Secondary License(s), so that the recipient of the Larger Work may, at their option, further distribute the Covered Software under the terms of either this License or such Secondary License(s). 3.4. Notices You may not remove or alter the substance of any license notices (including copyright notices, patent notices, disclaimers of warranty, or limitations of liability) contained within the Source Code Form of the Covered Software, except that You may alter any license notices to the extent required to remedy known factual inaccuracies. 3.5. Application of Additional Terms You may choose to offer, and to charge a fee for, warranty, support, indemnity or liability obligations to one or more recipients of Covered Software. However, You may do so only on Your own behalf, and not on behalf of any Contributor. You must make it absolutely clear that any such warranty, support, indemnity, or liability obligation is offered by You alone, and You hereby agree to indemnify every Contributor for any liability incurred by such Contributor as a result of warranty, support, indemnity or liability terms You offer. You may include additional disclaimers of warranty and limitations of liability specific to any jurisdiction. 4. Inability to Comply Due to Statute or Regulation If it is impossible for You to comply with any of the terms of this License with respect to some or all of the Covered Software due to statute, judicial order, or regulation then You must: (a) comply with the terms of this License to the maximum extent possible; and (b) describe the limitations and the code they affect. Such description must be placed in a text file included with all distributions of the Covered Software under this License. Except to the extent prohibited by statute or regulation, such description must be sufficiently detailed for a recipient of ordinary skill to be able to understand it. 5. Termination 5.1. The rights granted under this License will terminate automatically if You fail to comply with any of its terms. However, if You become compliant, then the rights granted under this License from a particular Contributor are reinstated (a) provisionally, unless and until such Contributor explicitly and finally terminates Your grants, and (b) on an ongoing basis, if such Contributor fails to notify You of the non-compliance by some reasonable means prior to 60 days after You have come back into compliance. 
Moreover, Your grants from a particular Contributor are reinstated on an ongoing basis if such Contributor notifies You of the non-compliance by some reasonable means, this is the first time You have received notice of non-compliance with this License from such Contributor, and You become compliant prior to 30 days after Your receipt of the notice. 5.2. If You initiate litigation against any entity by asserting a patent infringement claim (excluding declaratory judgment actions, counter-claims, and cross-claims) alleging that a Contributor Version directly or indirectly infringes any patent, then the rights granted to You by any and all Contributors for the Covered Software under Section 2.1 of this License shall terminate. 5.3. In the event of termination under Sections 5.1 or 5.2 above, all end user license agreements (excluding distributors and resellers) which have been validly granted by You or Your distributors under this License prior to termination shall survive termination. 6. Disclaimer of Warranty Covered Software is provided under this License on an “as is” basis, without warranty of any kind, either expressed, implied, or statutory, including, without limitation, warranties that the Covered Software is free of defects, merchantable, fit for a particular purpose or non-infringing. The entire risk as to the quality and performance of the Covered Software is with You. Should any Covered Software prove defective in any respect, You (not any Contributor) assume the cost of any necessary servicing, repair, or correction. This disclaimer of warranty constitutes an essential part of this License. No use of any Covered Software is authorized under this License except under this disclaimer. 7. Limitation of Liability Under no circumstances and under no legal theory, whether tort (including negligence), contract, or otherwise, shall any Contributor, or anyone who distributes Covered Software as permitted above, be liable to You for any direct, indirect, special, incidental, or consequential damages of any character including, without limitation, damages for lost profits, loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses, even if such party shall have been informed of the possibility of such damages. This limitation of liability shall not apply to liability for death or personal injury resulting from such party’s negligence to the extent applicable law prohibits such limitation. Some jurisdictions do not allow the exclusion or limitation of incidental or consequential damages, so this exclusion and limitation may not apply to You. 8. Litigation Any litigation relating to this License may be brought only in the courts of a jurisdiction where the defendant maintains its principal place of business and such litigation shall be governed by laws of that jurisdiction, without reference to its conflict-of-law provisions. Nothing in this Section shall prevent a party’s ability to bring cross-claims or counter-claims. 9. Miscellaneous This License represents the complete agreement concerning the subject matter hereof. If any provision of this License is held to be unenforceable, such provision shall be reformed only to the extent necessary to make it enforceable. Any law or regulation which provides that the language of a contract shall be construed against the drafter shall not be used to construe this License against a Contributor. 10. Versions of the License 10.1. New Versions Mozilla Foundation is the license steward. 
Except as provided in Section 10.3, no one other than the license steward has the right to modify or publish new versions of this License. Each version will be given a distinguishing version number. 10.2. Effect of New Versions You may distribute the Covered Software under the terms of the version of the License under which You originally received the Covered Software, or under the terms of any subsequent version published by the license steward. 10.3. Modified Versions If you create software not governed by this License, and you want to create a new license for such software, you may create and use a modified version of this License if you rename the license and remove any references to the name of the license steward (except to note that such modified license differs from this License). 10.4. Distributing Source Code Form that is Incompatible With Secondary Licenses If You choose to distribute Source Code Form that is Incompatible With Secondary Licenses under the terms of this version of the License, the notice described in Exhibit B of this License must be attached. Exhibit A - Source Code Form License Notice This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0. If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/. If it is not possible or desirable to put the notice in a particular file, then You may include the notice in a location (such as a LICENSE file in a relevant directory) where a recipient would be likely to look for such a notice. You may add additional accurate notices of copyright ownership. Exhibit B - “Incompatible With Secondary Licenses” Notice This Source Code Form is “Incompatible With Secondary Licenses”, as defined by the Mozilla Public License, v. 2.0. memberlist-0.1.0/Makefile000066400000000000000000000003601307374264600153260ustar00rootroot00000000000000test: subnet go test ./... integ: subnet INTEG_TESTS=yes go test ./... subnet: ./test/setup_subnet.sh cov: gocov test github.com/hashicorp/memberlist | gocov-html > /tmp/coverage.html open /tmp/coverage.html .PNONY: test cov integ memberlist-0.1.0/README.md000066400000000000000000000153631307374264600151560ustar00rootroot00000000000000# memberlist [![GoDoc](https://godoc.org/github.com/hashicorp/memberlist?status.png)](https://godoc.org/github.com/hashicorp/memberlist) memberlist is a [Go](http://www.golang.org) library that manages cluster membership and member failure detection using a gossip based protocol. The use cases for such a library are far-reaching: all distributed systems require membership, and memberlist is a re-usable solution to managing cluster membership and node failure detection. memberlist is eventually consistent but converges quickly on average. The speed at which it converges can be heavily tuned via various knobs on the protocol. Node failures are detected and network partitions are partially tolerated by attempting to communicate to potentially dead nodes through multiple routes. ## Building If you wish to build memberlist you'll need Go version 1.2+ installed. Please check your installation with: ``` go version ``` ## Usage Memberlist is surprisingly simple to use. An example is shown below: ```go /* Create the initial memberlist from a safe configuration. Please reference the godoc for other default config types. 
http://godoc.org/github.com/hashicorp/memberlist#Config */ list, err := memberlist.Create(memberlist.DefaultLocalConfig()) if err != nil { panic("Failed to create memberlist: " + err.Error()) } // Join an existing cluster by specifying at least one known member. n, err := list.Join([]string{"1.2.3.4"}) if err != nil { panic("Failed to join cluster: " + err.Error()) } // Ask for members of the cluster for _, member := range list.Members() { fmt.Printf("Member: %s %s\n", member.Name, member.Addr) } // Continue doing whatever you need, memberlist will maintain membership // information in the background. Delegates can be used for receiving // events when members join or leave. ``` The most difficult part of memberlist is configuring it since it has many available knobs in order to tune state propagation delay and convergence times. Memberlist provides a default configuration that offers a good starting point, but errs on the side of caution, choosing values that are optimized for higher convergence at the cost of higher bandwidth usage. For complete documentation, see the associated [Godoc](http://godoc.org/github.com/hashicorp/memberlist). ## Protocol memberlist is based on ["SWIM: Scalable Weakly-consistent Infection-style Process Group Membership Protocol"](http://www.cs.cornell.edu/~asdas/research/dsn02-swim.pdf), with a few minor adaptations, mostly to increase propagation speed and convergence rate. A high level overview of the memberlist protocol (based on SWIM) is described below, but for details please read the full [SWIM paper](http://www.cs.cornell.edu/~asdas/research/dsn02-swim.pdf) followed by the memberlist source. We welcome any questions related to the protocol on our issue tracker. ### Protocol Description memberlist begins by joining an existing cluster or starting a new cluster. If starting a new cluster, additional nodes are expected to join it. New nodes in an existing cluster must be given the address of at least one existing member in order to join the cluster. The new member does a full state sync with the existing member over TCP and begins gossiping its existence to the cluster. Gossip is done over UDP with a configurable but fixed fanout and interval. This ensures that network usage is constant with regards to number of nodes, as opposed to exponential growth that can occur with traditional heartbeat mechanisms. Complete state exchanges with a random node are done periodically over TCP, but much less often than gossip messages. This increases the likelihood that the membership list converges properly since the full state is exchanged and merged. The interval between full state exchanges is configurable or can be disabled entirely. Failure detection is done by periodic random probing using a configurable interval. If the node fails to ack within a reasonable time (typically some multiple of RTT), then an indirect probe as well as a direct TCP probe are attempted. An indirect probe asks a configurable number of random nodes to probe the same node, in case there are network issues causing our own node to fail the probe. The direct TCP probe is used to help identify the common situation where networking is misconfigured to allow TCP but not UDP. Without the TCP probe, a UDP-isolated node would think all other nodes were suspect and could cause churn in the cluster when it attempts a TCP-based state exchange with another node. 
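The failure detection behavior described above is driven by a handful of `Config` knobs. As a minimal sketch, those settings can be spelled out explicitly (the values below simply restate the `DefaultLANConfig` defaults, and the snippet assumes the same imports as the usage example above):

```go
conf := memberlist.DefaultLANConfig()
conf.ProbeInterval = 1 * time.Second       // how often a random member is probed
conf.ProbeTimeout = 500 * time.Millisecond // should approximate the 99th-percentile RTT on your network
conf.IndirectChecks = 3                    // peers asked to probe indirectly when a direct ack is missed
conf.DisableTcpPings = false               // keep the fallback TCP probe described above enabled
```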
It is not desirable to operate with only TCP connectivity because convergence will be much slower, but it is enabled so that memberlist can detect this situation and alert operators. If both our probe, the indirect probes, and the direct TCP probe fail within a configurable time, then the node is marked "suspicious" and this knowledge is gossiped to the cluster. A suspicious node is still considered a member of cluster. If the suspect member of the cluster does not dispute the suspicion within a configurable period of time, the node is finally considered dead, and this state is then gossiped to the cluster. This is a brief and incomplete description of the protocol. For a better idea, please read the [SWIM paper](http://www.cs.cornell.edu/~asdas/research/dsn02-swim.pdf) in its entirety, along with the memberlist source code. ### Changes from SWIM As mentioned earlier, the memberlist protocol is based on SWIM but includes minor changes, mostly to increase propagation speed and convergence rates. The changes from SWIM are noted here: * memberlist does a full state sync over TCP periodically. SWIM only propagates changes over gossip. While both eventually reach convergence, the full state sync increases the likelihood that nodes are fully converged more quickly, at the expense of more bandwidth usage. This feature can be totally disabled if you wish. * memberlist has a dedicated gossip layer separate from the failure detection protocol. SWIM only piggybacks gossip messages on top of probe/ack messages. memberlist also piggybacks gossip messages on top of probe/ack messages, but also will periodically send out dedicated gossip messages on their own. This feature lets you have a higher gossip rate (for example once per 200ms) and a slower failure detection rate (such as once per second), resulting in overall faster convergence rates and data propagation speeds. This feature can be totally disabed as well, if you wish. * memberlist stores around the state of dead nodes for a set amount of time, so that when full syncs are requested, the requester also receives information about dead nodes. Because SWIM doesn't do full syncs, SWIM deletes dead node state immediately upon learning that the node is dead. This change again helps the cluster converge more quickly. memberlist-0.1.0/alive_delegate.go000066400000000000000000000011131307374264600171440ustar00rootroot00000000000000package memberlist // AliveDelegate is used to involve a client in processing // a node "alive" message. When a node joins, either through // a UDP gossip or TCP push/pull, we update the state of // that node via an alive message. This can be used to filter // a node out and prevent it from being considered a peer // using application specific logic. type AliveDelegate interface { // NotifyMerge is invoked when a merge could take place. // Provides a list of the nodes known by the peer. If // the return value is non-nil, the merge is canceled. NotifyAlive(peer *Node) error } memberlist-0.1.0/awareness.go000066400000000000000000000033211307374264600162050ustar00rootroot00000000000000package memberlist import ( "sync" "time" "github.com/armon/go-metrics" ) // awareness manages a simple metric for tracking the estimated health of the // local node. Health is primary the node's ability to respond in the soft // real-time manner required for correct health checking of other nodes in the // cluster. 
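// The score is consumed by ScaleTimeout below, which multiplies a probe timeout by (score + 1), so a node that believes it is degraded waits longer for acks instead of wrongly suspecting healthy peers.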
type awareness struct { sync.RWMutex // max is the upper threshold for the timeout scale (the score will be // constrained to be from 0 <= score < max). max int // score is the current awareness score. Lower values are healthier and // zero is the minimum value. score int } // newAwareness returns a new awareness object. func newAwareness(max int) *awareness { return &awareness{ max: max, score: 0, } } // ApplyDelta takes the given delta and applies it to the score in a thread-safe // manner. It also enforces a floor of zero and a max of max, so deltas may not // change the overall score if it's railed at one of the extremes. func (a *awareness) ApplyDelta(delta int) { a.Lock() initial := a.score a.score += delta if a.score < 0 { a.score = 0 } else if a.score > (a.max - 1) { a.score = (a.max - 1) } final := a.score a.Unlock() if initial != final { metrics.SetGauge([]string{"memberlist", "health", "score"}, float32(final)) } } // GetHealthScore returns the raw health score. func (a *awareness) GetHealthScore() int { a.RLock() score := a.score a.RUnlock() return score } // ScaleTimeout takes the given duration and scales it based on the current // score. Less healthyness will lead to longer timeouts. func (a *awareness) ScaleTimeout(timeout time.Duration) time.Duration { a.RLock() score := a.score a.RUnlock() return timeout * (time.Duration(score) + 1) } memberlist-0.1.0/awareness_test.go000066400000000000000000000016441307374264600172520ustar00rootroot00000000000000package memberlist import ( "testing" "time" ) func TestAwareness(t *testing.T) { cases := []struct { delta int score int timeout time.Duration }{ {0, 0, 1 * time.Second}, {-1, 0, 1 * time.Second}, {-10, 0, 1 * time.Second}, {1, 1, 2 * time.Second}, {-1, 0, 1 * time.Second}, {10, 7, 8 * time.Second}, {-1, 6, 7 * time.Second}, {-1, 5, 6 * time.Second}, {-1, 4, 5 * time.Second}, {-1, 3, 4 * time.Second}, {-1, 2, 3 * time.Second}, {-1, 1, 2 * time.Second}, {-1, 0, 1 * time.Second}, {-1, 0, 1 * time.Second}, } a := newAwareness(8) for i, c := range cases { a.ApplyDelta(c.delta) if a.GetHealthScore() != c.score { t.Errorf("case %d: score mismatch %d != %d", i, a.score, c.score) } if timeout := a.ScaleTimeout(1 * time.Second); timeout != c.timeout { t.Errorf("case %d: scaled timeout mismatch %9.6f != %9.6f", i, timeout.Seconds(), c.timeout.Seconds()) } } } memberlist-0.1.0/broadcast.go000066400000000000000000000061771307374264600161730ustar00rootroot00000000000000package memberlist /* The broadcast mechanism works by maintaining a sorted list of messages to be sent out. When a message is to be broadcast, the retransmit count is set to zero and appended to the queue. The retransmit count serves as the "priority", ensuring that newer messages get sent first. Once a message hits the retransmit limit, it is removed from the queue. Additionally, older entries can be invalidated by new messages that are contradictory. 
For example, if we send "{suspect M1 inc: 1}, then a following {alive M1 inc: 2} will invalidate that message */ type memberlistBroadcast struct { node string msg []byte notify chan struct{} } func (b *memberlistBroadcast) Invalidates(other Broadcast) bool { // Check if that broadcast is a memberlist type mb, ok := other.(*memberlistBroadcast) if !ok { return false } // Invalidates any message about the same node return b.node == mb.node } func (b *memberlistBroadcast) Message() []byte { return b.msg } func (b *memberlistBroadcast) Finished() { select { case b.notify <- struct{}{}: default: } } // encodeAndBroadcast encodes a message and enqueues it for broadcast. Fails // silently if there is an encoding error. func (m *Memberlist) encodeAndBroadcast(node string, msgType messageType, msg interface{}) { m.encodeBroadcastNotify(node, msgType, msg, nil) } // encodeBroadcastNotify encodes a message and enqueues it for broadcast // and notifies the given channel when transmission is finished. Fails // silently if there is an encoding error. func (m *Memberlist) encodeBroadcastNotify(node string, msgType messageType, msg interface{}, notify chan struct{}) { buf, err := encode(msgType, msg) if err != nil { m.logger.Printf("[ERR] memberlist: Failed to encode message for broadcast: %s", err) } else { m.queueBroadcast(node, buf.Bytes(), notify) } } // queueBroadcast is used to start dissemination of a message. It will be // sent up to a configured number of times. The message could potentially // be invalidated by a future message about the same node func (m *Memberlist) queueBroadcast(node string, msg []byte, notify chan struct{}) { b := &memberlistBroadcast{node, msg, notify} m.broadcasts.QueueBroadcast(b) } // getBroadcasts is used to return a slice of broadcasts to send up to // a maximum byte size, while imposing a per-broadcast overhead. This is used // to fill a UDP packet with piggybacked data func (m *Memberlist) getBroadcasts(overhead, limit int) [][]byte { // Get memberlist messages first toSend := m.broadcasts.GetBroadcasts(overhead, limit) // Check if the user has anything to broadcast d := m.config.Delegate if d != nil { // Determine the bytes used already bytesUsed := 0 for _, msg := range toSend { bytesUsed += len(msg) + overhead } // Check space remaining for user messages avail := limit - bytesUsed if avail > overhead+userMsgOverhead { userMsgs := d.GetBroadcasts(overhead+userMsgOverhead, avail) // Frame each user message for _, msg := range userMsgs { buf := make([]byte, 1, len(msg)+1) buf[0] = byte(userMsg) buf = append(buf, msg...) toSend = append(toSend, buf) } } } return toSend } memberlist-0.1.0/broadcast_test.go000066400000000000000000000011021307374264600172110ustar00rootroot00000000000000package memberlist import ( "reflect" "testing" ) func TestMemberlistBroadcast_Invalidates(t *testing.T) { m1 := &memberlistBroadcast{"test", nil, nil} m2 := &memberlistBroadcast{"foo", nil, nil} if m1.Invalidates(m2) || m2.Invalidates(m1) { t.Fatalf("unexpected invalidation") } if !m1.Invalidates(m1) { t.Fatalf("expected invalidation") } } func TestMemberlistBroadcast_Message(t *testing.T) { m1 := &memberlistBroadcast{"test", []byte("test"), nil} msg := m1.Message() if !reflect.DeepEqual(msg, []byte("test")) { t.Fatalf("messages do not match") } } memberlist-0.1.0/config.go000066400000000000000000000305031307374264600154640ustar00rootroot00000000000000package memberlist import ( "io" "log" "os" "time" ) type Config struct { // The name of this node. This must be unique in the cluster. 
Name string // Transport is a hook for providing custom code to communicate with // other nodes. If this is left nil, then memberlist will by default // make a NetTransport using BindAddr and BindPort from this structure. Transport Transport // Configuration related to what address to bind to and ports to // listen on. The port is used for both UDP and TCP gossip. It is // assumed other nodes are running on this port, but they do not need // to. BindAddr string BindPort int // Configuration related to what address to advertise to other // cluster members. Used for nat traversal. AdvertiseAddr string AdvertisePort int // ProtocolVersion is the configured protocol version that we // will _speak_. This must be between ProtocolVersionMin and // ProtocolVersionMax. ProtocolVersion uint8 // TCPTimeout is the timeout for establishing a stream connection with // a remote node for a full state sync, and for stream read and write // operations. This is a legacy name for backwards compatibility, but // should really be called StreamTimeout now that we have generalized // the transport. TCPTimeout time.Duration // IndirectChecks is the number of nodes that will be asked to perform // an indirect probe of a node in the case a direct probe fails. Memberlist // waits for an ack from any single indirect node, so increasing this // number will increase the likelihood that an indirect probe will succeed // at the expense of bandwidth. IndirectChecks int // RetransmitMult is the multiplier for the number of retransmissions // that are attempted for messages broadcasted over gossip. The actual // count of retransmissions is calculated using the formula: // // Retransmits = RetransmitMult * log(N+1) // // This allows the retransmits to scale properly with cluster size. The // higher the multiplier, the more likely a failed broadcast is to converge // at the expense of increased bandwidth. RetransmitMult int // SuspicionMult is the multiplier for determining the time an // inaccessible node is considered suspect before declaring it dead. // The actual timeout is calculated using the formula: // // SuspicionTimeout = SuspicionMult * log(N+1) * ProbeInterval // // This allows the timeout to scale properly with expected propagation // delay with a larger cluster size. The higher the multiplier, the longer // an inaccessible node is considered part of the cluster before declaring // it dead, giving that suspect node more time to refute if it is indeed // still alive. SuspicionMult int // SuspicionMaxTimeoutMult is the multiplier applied to the // SuspicionTimeout used as an upper bound on detection time. This max // timeout is calculated using the formula: // // SuspicionMaxTimeout = SuspicionMaxTimeoutMult * SuspicionTimeout // // If everything is working properly, confirmations from other nodes will // accelerate suspicion timers in a manner which will cause the timeout // to reach the base SuspicionTimeout before that elapses, so this value // will typically only come into play if a node is experiencing issues // communicating with other nodes. It should be set to a something fairly // large so that a node having problems will have a lot of chances to // recover before falsely declaring other nodes as failed, but short // enough for a legitimately isolated node to still make progress marking // nodes failed in a reasonable amount of time. SuspicionMaxTimeoutMult int // PushPullInterval is the interval between complete state syncs. 
// Complete state syncs are done with a single node over TCP and are // quite expensive relative to standard gossiped messages. Setting this // to zero will disable state push/pull syncs completely. // // Setting this interval lower (more frequent) will increase convergence // speeds across larger clusters at the expense of increased bandwidth // usage. PushPullInterval time.Duration // ProbeInterval and ProbeTimeout are used to configure probing // behavior for memberlist. // // ProbeInterval is the interval between random node probes. Setting // this lower (more frequent) will cause the memberlist cluster to detect // failed nodes more quickly at the expense of increased bandwidth usage. // // ProbeTimeout is the timeout to wait for an ack from a probed node // before assuming it is unhealthy. This should be set to 99-percentile // of RTT (round-trip time) on your network. ProbeInterval time.Duration ProbeTimeout time.Duration // DisableTcpPings will turn off the fallback TCP pings that are attempted // if the direct UDP ping fails. These get pipelined along with the // indirect UDP pings. DisableTcpPings bool // AwarenessMaxMultiplier will increase the probe interval if the node // becomes aware that it might be degraded and not meeting the soft real // time requirements to reliably probe other nodes. AwarenessMaxMultiplier int // GossipInterval and GossipNodes are used to configure the gossip // behavior of memberlist. // // GossipInterval is the interval between sending messages that need // to be gossiped that haven't been able to piggyback on probing messages. // If this is set to zero, non-piggyback gossip is disabled. By lowering // this value (more frequent) gossip messages are propagated across // the cluster more quickly at the expense of increased bandwidth. // // GossipNodes is the number of random nodes to send gossip messages to // per GossipInterval. Increasing this number causes the gossip messages // to propagate across the cluster more quickly at the expense of // increased bandwidth. // // GossipToTheDeadTime is the interval after which a node has died that // we will still try to gossip to it. This gives it a chance to refute. GossipInterval time.Duration GossipNodes int GossipToTheDeadTime time.Duration // EnableCompression is used to control message compression. This can // be used to reduce bandwidth usage at the cost of slightly more CPU // utilization. This is only available starting at protocol version 1. EnableCompression bool // SecretKey is used to initialize the primary encryption key in a keyring. // The primary encryption key is the only key used to encrypt messages and // the first key used while attempting to decrypt messages. Providing a // value for this primary key will enable message-level encryption and // verification, and automatically install the key onto the keyring. // The value should be either 16, 24, or 32 bytes to select AES-128, // AES-192, or AES-256. SecretKey []byte // The keyring holds all of the encryption keys used internally. It is // automatically initialized using the SecretKey and SecretKeys values. Keyring *Keyring // Delegate and Events are delegates for receiving and providing // data to memberlist via callback mechanisms. For Delegate, see // the Delegate interface. For Events, see the EventDelegate interface. // // The DelegateProtocolMin/Max are used to guarantee protocol-compatibility // for any custom messages that the delegate might do (broadcasts, // local/remote state, etc.). 
If you don't set these, then the protocol // versions will just be zero, and version compliance won't be done. Delegate Delegate DelegateProtocolVersion uint8 DelegateProtocolMin uint8 DelegateProtocolMax uint8 Events EventDelegate Conflict ConflictDelegate Merge MergeDelegate Ping PingDelegate Alive AliveDelegate // DNSConfigPath points to the system's DNS config file, usually located // at /etc/resolv.conf. It can be overridden via config for easier testing. DNSConfigPath string // LogOutput is the writer where logs should be sent. If this is not // set, logging will go to stderr by default. You cannot specify both LogOutput // and Logger at the same time. LogOutput io.Writer // Logger is a custom logger which you provide. If Logger is set, it will use // this for the internal logger. If Logger is not set, it will fall back to the // behavior for using LogOutput. You cannot specify both LogOutput and Logger // at the same time. Logger *log.Logger // Size of Memberlist's internal channel which handles UDP messages. The // size of this determines the size of the queue which Memberlist will keep // while UDP messages are handled. HandoffQueueDepth int // Maximum number of bytes that memberlist will put in a packet (this // will be for UDP packets by default with a NetTransport). A safe value // for this is typically 1400 bytes (which is the default). However, // depending on your network's MTU (Maximum Transmission Unit) you may // be able to increase this to get more content into each gossip packet. // This is a legacy name for backward compatibility but should really be // called PacketBufferSize now that we have generalized the transport. UDPBufferSize int } // DefaultLANConfig returns a sane set of configurations for Memberlist. // It uses the hostname as the node name, and otherwise sets very conservative // values that are sane for most LAN environments. The default configuration // errs on the side of caution, choosing values that are optimized // for higher convergence at the cost of higher bandwidth usage. Regardless, // these values are a good starting point when getting started with memberlist. func DefaultLANConfig() *Config { hostname, _ := os.Hostname() return &Config{ Name: hostname, BindAddr: "0.0.0.0", BindPort: 7946, AdvertiseAddr: "", AdvertisePort: 7946, ProtocolVersion: ProtocolVersion2Compatible, TCPTimeout: 10 * time.Second, // Timeout after 10 seconds IndirectChecks: 3, // Use 3 nodes for the indirect ping RetransmitMult: 4, // Retransmit a message 4 * log(N+1) nodes SuspicionMult: 5, // Suspect a node for 5 * log(N+1) * Interval SuspicionMaxTimeoutMult: 6, // For 10k nodes this will give a max timeout of 120 seconds PushPullInterval: 30 * time.Second, // Low frequency ProbeTimeout: 500 * time.Millisecond, // Reasonable RTT time for LAN ProbeInterval: 1 * time.Second, // Failure check every second DisableTcpPings: false, // TCP pings are safe, even with mixed versions AwarenessMaxMultiplier: 8, // Probe interval backs off to 8 seconds GossipNodes: 3, // Gossip to 3 nodes GossipInterval: 200 * time.Millisecond, // Gossip more rapidly GossipToTheDeadTime: 30 * time.Second, // Same as push/pull EnableCompression: true, // Enable compression by default SecretKey: nil, Keyring: nil, DNSConfigPath: "/etc/resolv.conf", HandoffQueueDepth: 1024, UDPBufferSize: 1400, } } // DefaultWANConfig works like DefaultConfig, however it returns a configuration // that is optimized for most WAN environments. 
The default configuration is // still very conservative and errs on the side of caution. func DefaultWANConfig() *Config { conf := DefaultLANConfig() conf.TCPTimeout = 30 * time.Second conf.SuspicionMult = 6 conf.PushPullInterval = 60 * time.Second conf.ProbeTimeout = 3 * time.Second conf.ProbeInterval = 5 * time.Second conf.GossipNodes = 4 // Gossip less frequently, but to an additional node conf.GossipInterval = 500 * time.Millisecond conf.GossipToTheDeadTime = 60 * time.Second return conf } // DefaultLocalConfig works like DefaultConfig, however it returns a configuration // that is optimized for a local loopback environments. The default configuration is // still very conservative and errs on the side of caution. func DefaultLocalConfig() *Config { conf := DefaultLANConfig() conf.TCPTimeout = time.Second conf.IndirectChecks = 1 conf.RetransmitMult = 2 conf.SuspicionMult = 3 conf.PushPullInterval = 15 * time.Second conf.ProbeTimeout = 200 * time.Millisecond conf.ProbeInterval = time.Second conf.GossipInterval = 100 * time.Millisecond conf.GossipToTheDeadTime = 15 * time.Second return conf } // Returns whether or not encryption is enabled func (c *Config) EncryptionEnabled() bool { return c.Keyring != nil && len(c.Keyring.GetKeys()) > 0 } memberlist-0.1.0/conflict_delegate.go000066400000000000000000000005701307374264600176530ustar00rootroot00000000000000package memberlist // ConflictDelegate is a used to inform a client that // a node has attempted to join which would result in a // name conflict. This happens if two clients are configured // with the same name but different addresses. type ConflictDelegate interface { // NotifyConflict is invoked when a name conflict is detected NotifyConflict(existing, other *Node) } memberlist-0.1.0/delegate.go000066400000000000000000000034731307374264600157770ustar00rootroot00000000000000package memberlist // Delegate is the interface that clients must implement if they want to hook // into the gossip layer of Memberlist. All the methods must be thread-safe, // as they can and generally will be called concurrently. type Delegate interface { // NodeMeta is used to retrieve meta-data about the current node // when broadcasting an alive message. It's length is limited to // the given byte size. This metadata is available in the Node structure. NodeMeta(limit int) []byte // NotifyMsg is called when a user-data message is received. // Care should be taken that this method does not block, since doing // so would block the entire UDP packet receive loop. Additionally, the byte // slice may be modified after the call returns, so it should be copied if needed NotifyMsg([]byte) // GetBroadcasts is called when user data messages can be broadcast. // It can return a list of buffers to send. Each buffer should assume an // overhead as provided with a limit on the total byte size allowed. // The total byte size of the resulting data to send must not exceed // the limit. Care should be taken that this method does not block, // since doing so would block the entire UDP packet receive loop. GetBroadcasts(overhead, limit int) [][]byte // LocalState is used for a TCP Push/Pull. This is sent to // the remote side in addition to the membership information. Any // data can be sent here. See MergeRemoteState as well. The `join` // boolean indicates this is for a join instead of a push/pull. LocalState(join bool) []byte // MergeRemoteState is invoked after a TCP Push/Pull. 
This is the // state received from the remote side and is the result of the // remote side's LocalState call. The 'join' // boolean indicates this is for a join instead of a push/pull. MergeRemoteState(buf []byte, join bool) } memberlist-0.1.0/event_delegate.go000066400000000000000000000035161307374264600171760ustar00rootroot00000000000000package memberlist // EventDelegate is a simpler delegate that is used only to receive // notifications about members joining and leaving. The methods in this // delegate may be called by multiple goroutines, but never concurrently. // This allows you to reason about ordering. type EventDelegate interface { // NotifyJoin is invoked when a node is detected to have joined. // The Node argument must not be modified. NotifyJoin(*Node) // NotifyLeave is invoked when a node is detected to have left. // The Node argument must not be modified. NotifyLeave(*Node) // NotifyUpdate is invoked when a node is detected to have // updated, usually involving the meta data. The Node argument // must not be modified. NotifyUpdate(*Node) } // ChannelEventDelegate is used to enable an application to receive // events about joins and leaves over a channel instead of a direct // function call. // // Care must be taken that events are processed in a timely manner from // the channel, since this delegate will block until an event can be sent. type ChannelEventDelegate struct { Ch chan<- NodeEvent } // NodeEventType are the types of events that can be sent from the // ChannelEventDelegate. type NodeEventType int const ( NodeJoin NodeEventType = iota NodeLeave NodeUpdate ) // NodeEvent is a single event related to node activity in the memberlist. // The Node member of this struct must not be directly modified. It is passed // as a pointer to avoid unnecessary copies. If you wish to modify the node, // make a copy first. type NodeEvent struct { Event NodeEventType Node *Node } func (c *ChannelEventDelegate) NotifyJoin(n *Node) { c.Ch <- NodeEvent{NodeJoin, n} } func (c *ChannelEventDelegate) NotifyLeave(n *Node) { c.Ch <- NodeEvent{NodeLeave, n} } func (c *ChannelEventDelegate) NotifyUpdate(n *Node) { c.Ch <- NodeEvent{NodeUpdate, n} } memberlist-0.1.0/integ_test.go000066400000000000000000000036321307374264600163670ustar00rootroot00000000000000package memberlist import ( "fmt" "log" "os" "testing" "time" ) // CheckInteg will skip a test if integration testing is not enabled. func CheckInteg(t *testing.T) { if !IsInteg() { t.SkipNow() } } // IsInteg returns a boolean telling you if we're in integ testing mode. func IsInteg() bool { return os.Getenv("INTEG_TESTS") != "" } // Tests the memberlist by creating a cluster of 100 nodes // and checking that we get strong convergence of changes. 
func TestMemberlist_Integ(t *testing.T) { CheckInteg(t) num := 16 var members []*Memberlist secret := []byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15} eventCh := make(chan NodeEvent, num) addr := "127.0.0.1" for i := 0; i < num; i++ { c := DefaultLANConfig() c.Name = fmt.Sprintf("%s:%d", addr, 12345+i) c.BindAddr = addr c.BindPort = 12345 + i c.ProbeInterval = 20 * time.Millisecond c.ProbeTimeout = 100 * time.Millisecond c.GossipInterval = 20 * time.Millisecond c.PushPullInterval = 200 * time.Millisecond c.SecretKey = secret if i == 0 { c.Events = &ChannelEventDelegate{eventCh} } m, err := Create(c) if err != nil { t.Fatalf("unexpected err: %s", err) } members = append(members, m) defer m.Shutdown() if i > 0 { last := members[i-1] num, err := m.Join([]string{last.config.Name}) if num == 0 || err != nil { t.Fatalf("unexpected err: %s", err) } } } // Wait and print debug info breakTimer := time.After(250 * time.Millisecond) WAIT: for { select { case e := <-eventCh: if e.Event == NodeJoin { log.Printf("[DEBUG] Node join: %v (%d)", *e.Node, members[0].NumMembers()) } else { log.Printf("[DEBUG] Node leave: %v (%d)", *e.Node, members[0].NumMembers()) } case <-breakTimer: break WAIT } } for idx, m := range members { got := m.NumMembers() if got != num { t.Errorf("bad num members at idx %d. Expected %d. Got %d.", idx, num, got) } } } memberlist-0.1.0/keyring.go000066400000000000000000000106311307374264600156670ustar00rootroot00000000000000package memberlist import ( "bytes" "fmt" "sync" ) type Keyring struct { // Keys stores the key data used during encryption and decryption. It is // ordered in such a way where the first key (index 0) is the primary key, // which is used for encrypting messages, and is the first key tried during // message decryption. keys [][]byte // The keyring lock is used while performing IO operations on the keyring. l sync.Mutex } // Init allocates substructures func (k *Keyring) init() { k.keys = make([][]byte, 0) } // NewKeyring constructs a new container for a set of encryption keys. The // keyring contains all key data used internally by memberlist. // // While creating a new keyring, you must do one of: // - Omit keys and primary key, effectively disabling encryption // - Pass a set of keys plus the primary key // - Pass only a primary key // // If only a primary key is passed, then it will be automatically added to the // keyring. If creating a keyring with multiple keys, one key must be designated // primary by passing it as the primaryKey. If the primaryKey does not exist in // the list of secondary keys, it will be automatically added at position 0. // // A key should be either 16, 24, or 32 bytes to select AES-128, // AES-192, or AES-256. func NewKeyring(keys [][]byte, primaryKey []byte) (*Keyring, error) { keyring := &Keyring{} keyring.init() if len(keys) > 0 || len(primaryKey) > 0 { if len(primaryKey) == 0 { return nil, fmt.Errorf("Empty primary key not allowed") } if err := keyring.AddKey(primaryKey); err != nil { return nil, err } for _, key := range keys { if err := keyring.AddKey(key); err != nil { return nil, err } } } return keyring, nil } // ValidateKey will check to see if the key is valid and returns an error if not. // // key should be either 16, 24, or 32 bytes to select AES-128, // AES-192, or AES-256. func ValidateKey(key []byte) error { if l := len(key); l != 16 && l != 24 && l != 32 { return fmt.Errorf("key size must be 16, 24 or 32 bytes") } return nil } // AddKey will install a new key on the ring. 
Adding a key to the ring will make // it available for use in decryption. If the key already exists on the ring, // this function will just return noop. // // key should be either 16, 24, or 32 bytes to select AES-128, // AES-192, or AES-256. func (k *Keyring) AddKey(key []byte) error { if err := ValidateKey(key); err != nil { return err } // No-op if key is already installed for _, installedKey := range k.keys { if bytes.Equal(installedKey, key) { return nil } } keys := append(k.keys, key) primaryKey := k.GetPrimaryKey() if primaryKey == nil { primaryKey = key } k.installKeys(keys, primaryKey) return nil } // UseKey changes the key used to encrypt messages. This is the only key used to // encrypt messages, so peers should know this key before this method is called. func (k *Keyring) UseKey(key []byte) error { for _, installedKey := range k.keys { if bytes.Equal(key, installedKey) { k.installKeys(k.keys, key) return nil } } return fmt.Errorf("Requested key is not in the keyring") } // RemoveKey drops a key from the keyring. This will return an error if the key // requested for removal is currently at position 0 (primary key). func (k *Keyring) RemoveKey(key []byte) error { if bytes.Equal(key, k.keys[0]) { return fmt.Errorf("Removing the primary key is not allowed") } for i, installedKey := range k.keys { if bytes.Equal(key, installedKey) { keys := append(k.keys[:i], k.keys[i+1:]...) k.installKeys(keys, k.keys[0]) } } return nil } // installKeys will take out a lock on the keyring, and replace the keys with a // new set of keys. The key indicated by primaryKey will be installed as the new // primary key. func (k *Keyring) installKeys(keys [][]byte, primaryKey []byte) { k.l.Lock() defer k.l.Unlock() newKeys := [][]byte{primaryKey} for _, key := range keys { if !bytes.Equal(key, primaryKey) { newKeys = append(newKeys, key) } } k.keys = newKeys } // GetKeys returns the current set of keys on the ring. func (k *Keyring) GetKeys() [][]byte { k.l.Lock() defer k.l.Unlock() return k.keys } // GetPrimaryKey returns the key on the ring at position 0. This is the key used // for encrypting messages, and is the first key tried for decrypting messages. 
func (k *Keyring) GetPrimaryKey() (key []byte) { k.l.Lock() defer k.l.Unlock() if len(k.keys) > 0 { key = k.keys[0] } return } memberlist-0.1.0/keyring_test.go000066400000000000000000000072011307374264600167250ustar00rootroot00000000000000package memberlist import ( "bytes" "testing" ) var TestKeys [][]byte = [][]byte{ []byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, []byte{15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, []byte{8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, } func TestKeyring_EmptyRing(t *testing.T) { // Keyrings can be created with no encryption keys (disabled encryption) keyring, err := NewKeyring(nil, nil) if err != nil { t.Fatalf("err: %s", err) } keys := keyring.GetKeys() if len(keys) != 0 { t.Fatalf("Expected 0 keys but have %d", len(keys)) } } func TestKeyring_PrimaryOnly(t *testing.T) { // Keyrings can be created using only a primary key keyring, err := NewKeyring(nil, TestKeys[0]) if err != nil { t.Fatalf("err: %s", err) } keys := keyring.GetKeys() if len(keys) != 1 { t.Fatalf("Expected 1 key but have %d", len(keys)) } } func TestKeyring_GetPrimaryKey(t *testing.T) { keyring, err := NewKeyring(TestKeys, TestKeys[1]) if err != nil { t.Fatalf("err: %s", err) } // GetPrimaryKey returns correct key primaryKey := keyring.GetPrimaryKey() if !bytes.Equal(primaryKey, TestKeys[1]) { t.Fatalf("Unexpected primary key: %v", primaryKey) } } func TestKeyring_AddRemoveUse(t *testing.T) { keyring, err := NewKeyring(nil, TestKeys[1]) if err != nil { t.Fatalf("err :%s", err) } // Use non-existent key throws error if err := keyring.UseKey(TestKeys[2]); err == nil { t.Fatalf("Expected key not installed error") } // Add key to ring if err := keyring.AddKey(TestKeys[2]); err != nil { t.Fatalf("err: %s", err) } keys := keyring.GetKeys() if !bytes.Equal(keys[0], TestKeys[1]) { t.Fatalf("Unexpected primary key change") } if len(keys) != 2 { t.Fatalf("Expected 2 keys but have %d", len(keys)) } // Use key that exists should succeed if err := keyring.UseKey(TestKeys[2]); err != nil { t.Fatalf("err: %s", err) } primaryKey := keyring.GetPrimaryKey() if !bytes.Equal(primaryKey, TestKeys[2]) { t.Fatalf("Unexpected primary key: %v", primaryKey) } // Removing primary key should fail if err := keyring.RemoveKey(TestKeys[2]); err == nil { t.Fatalf("Expected primary key removal error") } // Removing non-primary key should succeed if err := keyring.RemoveKey(TestKeys[1]); err != nil { t.Fatalf("err: %s", err) } keys = keyring.GetKeys() if len(keys) != 1 { t.Fatalf("Expected 1 key but have %d", len(keys)) } } func TestKeyRing_MultiKeyEncryptDecrypt(t *testing.T) { plaintext := []byte("this is a plain text message") extra := []byte("random data") keyring, err := NewKeyring(TestKeys, TestKeys[0]) if err != nil { t.Fatalf("err: %s", err) } // First encrypt using the primary key and make sure we can decrypt var buf bytes.Buffer err = encryptPayload(1, TestKeys[0], plaintext, extra, &buf) if err != nil { t.Fatalf("err: %v", err) } msg, err := decryptPayload(keyring.GetKeys(), buf.Bytes(), extra) if err != nil { t.Fatalf("err: %v", err) } if !bytes.Equal(msg, plaintext) { t.Fatalf("bad: %v", msg) } // Now encrypt with a secondary key and try decrypting again. 
buf.Reset() err = encryptPayload(1, TestKeys[2], plaintext, extra, &buf) if err != nil { t.Fatalf("err: %v", err) } msg, err = decryptPayload(keyring.GetKeys(), buf.Bytes(), extra) if err != nil { t.Fatalf("err: %v", err) } if !bytes.Equal(msg, plaintext) { t.Fatalf("bad: %v", msg) } // Remove a key from the ring, and then try decrypting again if err := keyring.RemoveKey(TestKeys[2]); err != nil { t.Fatalf("err: %s", err) } msg, err = decryptPayload(keyring.GetKeys(), buf.Bytes(), extra) if err == nil { t.Fatalf("Expected no keys to decrypt message") } } memberlist-0.1.0/logging.go000066400000000000000000000004731307374264600156500ustar00rootroot00000000000000package memberlist import ( "fmt" "net" ) func LogAddress(addr net.Addr) string { if addr == nil { return "from=" } return fmt.Sprintf("from=%s", addr.String()) } func LogConn(conn net.Conn) string { if conn == nil { return LogAddress(nil) } return LogAddress(conn.RemoteAddr()) } memberlist-0.1.0/logging_test.go000066400000000000000000000014411307374264600167030ustar00rootroot00000000000000package memberlist import ( "fmt" "net" "testing" ) func TestLogging_Address(t *testing.T) { s := LogAddress(nil) if s != "from=" { t.Fatalf("bad: %s", s) } addr, err := net.ResolveIPAddr("ip4", "127.0.0.1") if err != nil { t.Fatalf("err: %v", err) } s = LogAddress(addr) if s != "from=127.0.0.1" { t.Fatalf("bad: %s", s) } } func TestLogging_Conn(t *testing.T) { s := LogConn(nil) if s != "from=" { t.Fatalf("bad: %s", s) } ln, err := net.Listen("tcp", ":0") if err != nil { t.Fatalf("err: %v", err) } conn, err := net.Dial("tcp", ln.Addr().String()) if err != nil { t.Fatalf("err: %v", err) } defer conn.Close() s = LogConn(conn) if s != fmt.Sprintf("from=%s", conn.RemoteAddr().String()) { t.Fatalf("bad: %s", s) } } memberlist-0.1.0/memberlist.go000066400000000000000000000435641307374264600163750ustar00rootroot00000000000000/* memberlist is a library that manages cluster membership and member failure detection using a gossip based protocol. The use cases for such a library are far-reaching: all distributed systems require membership, and memberlist is a re-usable solution to managing cluster membership and node failure detection. memberlist is eventually consistent but converges quickly on average. The speed at which it converges can be heavily tuned via various knobs on the protocol. Node failures are detected and network partitions are partially tolerated by attempting to communicate to potentially dead nodes through multiple routes. */ package memberlist import ( "fmt" "log" "net" "os" "strconv" "strings" "sync" "time" "github.com/hashicorp/go-multierror" sockaddr "github.com/hashicorp/go-sockaddr" "github.com/miekg/dns" ) type Memberlist struct { sequenceNum uint32 // Local sequence number incarnation uint32 // Local incarnation number numNodes uint32 // Number of known nodes (estimate) config *Config shutdown bool shutdownCh chan struct{} leave bool leaveBroadcast chan struct{} transport Transport handoff chan msgHandoff nodeLock sync.RWMutex nodes []*nodeState // Known nodes nodeMap map[string]*nodeState // Maps Addr.String() -> NodeState nodeTimers map[string]*suspicion // Maps Addr.String() -> suspicion timer awareness *awareness tickerLock sync.Mutex tickers []*time.Ticker stopTick chan struct{} probeIndex int ackLock sync.Mutex ackHandlers map[uint32]*ackHandler broadcasts *TransmitLimitedQueue logger *log.Logger } // newMemberlist creates the network listeners. // Does not schedule execution of background maintenance. 
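// (Only the stream/packet listener goroutines are started here; the periodic maintenance tickers are started later by schedule, which Create invokes once setAlive succeeds.)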
func newMemberlist(conf *Config) (*Memberlist, error) { if conf.ProtocolVersion < ProtocolVersionMin { return nil, fmt.Errorf("Protocol version '%d' too low. Must be in range: [%d, %d]", conf.ProtocolVersion, ProtocolVersionMin, ProtocolVersionMax) } else if conf.ProtocolVersion > ProtocolVersionMax { return nil, fmt.Errorf("Protocol version '%d' too high. Must be in range: [%d, %d]", conf.ProtocolVersion, ProtocolVersionMin, ProtocolVersionMax) } if len(conf.SecretKey) > 0 { if conf.Keyring == nil { keyring, err := NewKeyring(nil, conf.SecretKey) if err != nil { return nil, err } conf.Keyring = keyring } else { if err := conf.Keyring.AddKey(conf.SecretKey); err != nil { return nil, err } if err := conf.Keyring.UseKey(conf.SecretKey); err != nil { return nil, err } } } if conf.LogOutput != nil && conf.Logger != nil { return nil, fmt.Errorf("Cannot specify both LogOutput and Logger. Please choose a single log configuration setting.") } logDest := conf.LogOutput if logDest == nil { logDest = os.Stderr } logger := conf.Logger if logger == nil { logger = log.New(logDest, "", log.LstdFlags) } // Set up a network transport by default if a custom one wasn't given // by the config. transport := conf.Transport if transport == nil { nc := &NetTransportConfig{ BindAddrs: []string{conf.BindAddr}, BindPort: conf.BindPort, Logger: logger, } nt, err := NewNetTransport(nc) if err != nil { return nil, fmt.Errorf("Could not set up network transport: %v", err) } if conf.BindPort == 0 { port := nt.GetAutoBindPort() conf.BindPort = port logger.Printf("[DEBUG] Using dynamic bind port %d", port) } transport = nt } m := &Memberlist{ config: conf, shutdownCh: make(chan struct{}), leaveBroadcast: make(chan struct{}, 1), transport: transport, handoff: make(chan msgHandoff, conf.HandoffQueueDepth), nodeMap: make(map[string]*nodeState), nodeTimers: make(map[string]*suspicion), awareness: newAwareness(conf.AwarenessMaxMultiplier), ackHandlers: make(map[uint32]*ackHandler), broadcasts: &TransmitLimitedQueue{RetransmitMult: conf.RetransmitMult}, logger: logger, } m.broadcasts.NumNodes = func() int { return m.estNumNodes() } go m.streamListen() go m.packetListen() go m.packetHandler() return m, nil } // Create will create a new Memberlist using the given configuration. // This will not connect to any other node (see Join) yet, but will start // all the listeners to allow other nodes to join this memberlist. // After creating a Memberlist, the configuration given should not be // modified by the user anymore. func Create(conf *Config) (*Memberlist, error) { m, err := newMemberlist(conf) if err != nil { return nil, err } if err := m.setAlive(); err != nil { m.Shutdown() return nil, err } m.schedule() return m, nil } // Join is used to take an existing Memberlist and attempt to join a cluster // by contacting all the given hosts and performing a state sync. Initially, // the Memberlist only contains our own state, so doing this will cause // remote nodes to become aware of the existence of this node, effectively // joining the cluster. // // This returns the number of hosts successfully contacted and an error if // none could be reached. If an error is returned, the node did not successfully // join the cluster. 
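// A short sketch of a typical join from a client package (the peer addresses
// are hypothetical; a host given without a port falls back to the configured
// BindPort, see resolveAddr):
//
//    n, err := list.Join([]string{"10.0.0.10:7946", "10.0.0.11"})
//    if err != nil {
//        log.Printf("[WARN] join: %v", err)
//    }
//    log.Printf("contacted %d nodes", n)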
func (m *Memberlist) Join(existing []string) (int, error) { numSuccess := 0 var errs error for _, exist := range existing { addrs, err := m.resolveAddr(exist) if err != nil { err = fmt.Errorf("Failed to resolve %s: %v", exist, err) errs = multierror.Append(errs, err) m.logger.Printf("[WARN] memberlist: %v", err) continue } for _, addr := range addrs { hp := joinHostPort(addr.ip.String(), addr.port) if err := m.pushPullNode(hp, true); err != nil { err = fmt.Errorf("Failed to join %s: %v", addr.ip, err) errs = multierror.Append(errs, err) m.logger.Printf("[DEBUG] memberlist: %v", err) continue } numSuccess++ } } if numSuccess > 0 { errs = nil } return numSuccess, errs } // ipPort holds information about a node we want to try to join. type ipPort struct { ip net.IP port uint16 } // tcpLookupIP is a helper to initiate a TCP-based DNS lookup for the given host. // The built-in Go resolver will do a UDP lookup first, and will only use TCP if // the response has the truncate bit set, which isn't common on DNS servers like // Consul's. By doing the TCP lookup directly, we get the best chance for the // largest list of hosts to join. Since joins are relatively rare events, it's ok // to do this rather expensive operation. func (m *Memberlist) tcpLookupIP(host string, defaultPort uint16) ([]ipPort, error) { // Don't attempt any TCP lookups against non-fully qualified domain // names, since those will likely come from the resolv.conf file. if !strings.Contains(host, ".") { return nil, nil } // Make sure the domain name is terminated with a dot (we know there's // at least one character at this point). dn := host if dn[len(dn)-1] != '.' { dn = dn + "." } // See if we can find a server to try. cc, err := dns.ClientConfigFromFile(m.config.DNSConfigPath) if err != nil { return nil, err } if len(cc.Servers) > 0 { // We support host:port in the DNS config, but need to add the // default port if one is not supplied. server := cc.Servers[0] if !hasPort(server) { server = net.JoinHostPort(server, cc.Port) } // Do the lookup. c := new(dns.Client) c.Net = "tcp" msg := new(dns.Msg) msg.SetQuestion(dn, dns.TypeANY) in, _, err := c.Exchange(msg, server) if err != nil { return nil, err } // Handle any IPs we get back that we can attempt to join. var ips []ipPort for _, r := range in.Answer { switch rr := r.(type) { case (*dns.A): ips = append(ips, ipPort{rr.A, defaultPort}) case (*dns.AAAA): ips = append(ips, ipPort{rr.AAAA, defaultPort}) case (*dns.CNAME): m.logger.Printf("[DEBUG] memberlist: Ignoring CNAME RR in TCP-first answer for '%s'", host) } } return ips, nil } return nil, nil } // resolveAddr is used to resolve the address into an address, // port, and error. If no port is given, use the default func (m *Memberlist) resolveAddr(hostStr string) ([]ipPort, error) { // Normalize the incoming string to host:port so we can apply Go's // parser to it. port := uint16(0) if !hasPort(hostStr) { hostStr += ":" + strconv.Itoa(m.config.BindPort) } host, sport, err := net.SplitHostPort(hostStr) if err != nil { return nil, err } // This will capture the supplied port, or the default one added above. lport, err := strconv.ParseUint(sport, 10, 16) if err != nil { return nil, err } port = uint16(lport) // If it looks like an IP address we are done. The SplitHostPort() above // will make sure the host part is in good shape for parsing, even for // IPv6 addresses. 
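// For reference, the unit tests accept "127.0.0.1:80", "[::1]:80", "[::1]",
// "localhost", and "localhost:80" here, and reject a host-less ":80" as well
// as an out-of-range "localhost:80000".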
if ip := net.ParseIP(host); ip != nil { return []ipPort{ipPort{ip, port}}, nil } // First try TCP so we have the best chance for the largest list of // hosts to join. If this fails it's not fatal since this isn't a standard // way to query DNS, and we have a fallback below. ips, err := m.tcpLookupIP(host, port) if err != nil { m.logger.Printf("[DEBUG] memberlist: TCP-first lookup failed for '%s', falling back to UDP: %s", hostStr, err) } if len(ips) > 0 { return ips, nil } // If TCP didn't yield anything then use the normal Go resolver which // will try UDP, then might possibly try TCP again if the UDP response // indicates it was truncated. ans, err := net.LookupIP(host) if err != nil { return nil, err } ips = make([]ipPort, 0, len(ans)) for _, ip := range ans { ips = append(ips, ipPort{ip, port}) } return ips, nil } // setAlive is used to mark this node as being alive. This is the same // as if we received an alive notification our own network channel for // ourself. func (m *Memberlist) setAlive() error { // Get the final advertise address from the transport, which may need // to see which address we bound to. addr, port, err := m.transport.FinalAdvertiseAddr( m.config.AdvertiseAddr, m.config.AdvertisePort) if err != nil { return fmt.Errorf("Failed to get final advertise address: %v", err) } // Check if this is a public address without encryption ipAddr, err := sockaddr.NewIPAddr(addr.String()) if err != nil { return fmt.Errorf("Failed to parse interface addresses: %v", err) } ifAddrs := []sockaddr.IfAddr{ sockaddr.IfAddr{ SockAddr: ipAddr, }, } _, publicIfs, err := sockaddr.IfByRFC("6890", ifAddrs) if len(publicIfs) > 0 && !m.config.EncryptionEnabled() { m.logger.Printf("[WARN] memberlist: Binding to public address without encryption!") } // Set any metadata from the delegate. var meta []byte if m.config.Delegate != nil { meta = m.config.Delegate.NodeMeta(MetaMaxSize) if len(meta) > MetaMaxSize { panic("Node meta data provided is longer than the limit") } } a := alive{ Incarnation: m.nextIncarnation(), Node: m.config.Name, Addr: addr, Port: uint16(port), Meta: meta, Vsn: []uint8{ ProtocolVersionMin, ProtocolVersionMax, m.config.ProtocolVersion, m.config.DelegateProtocolMin, m.config.DelegateProtocolMax, m.config.DelegateProtocolVersion, }, } m.aliveNode(&a, nil, true) return nil } // LocalNode is used to return the local Node func (m *Memberlist) LocalNode() *Node { m.nodeLock.RLock() defer m.nodeLock.RUnlock() state := m.nodeMap[m.config.Name] return &state.Node } // UpdateNode is used to trigger re-advertising the local node. This is // primarily used with a Delegate to support dynamic updates to the local // meta data. This will block until the update message is successfully // broadcasted to a member of the cluster, if any exist or until a specified // timeout is reached. 
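// A short sketch (assuming the delegate's NodeMeta now returns updated data;
// a zero timeout waits for the broadcast without a deadline):
//
//    if err := list.UpdateNode(10 * time.Second); err != nil {
//        log.Printf("[WARN] meta update not broadcast: %v", err)
//    }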
func (m *Memberlist) UpdateNode(timeout time.Duration) error { // Get the node meta data var meta []byte if m.config.Delegate != nil { meta = m.config.Delegate.NodeMeta(MetaMaxSize) if len(meta) > MetaMaxSize { panic("Node meta data provided is longer than the limit") } } // Get the existing node m.nodeLock.RLock() state := m.nodeMap[m.config.Name] m.nodeLock.RUnlock() // Format a new alive message a := alive{ Incarnation: m.nextIncarnation(), Node: m.config.Name, Addr: state.Addr, Port: state.Port, Meta: meta, Vsn: []uint8{ ProtocolVersionMin, ProtocolVersionMax, m.config.ProtocolVersion, m.config.DelegateProtocolMin, m.config.DelegateProtocolMax, m.config.DelegateProtocolVersion, }, } notifyCh := make(chan struct{}) m.aliveNode(&a, notifyCh, true) // Wait for the broadcast or a timeout if m.anyAlive() { var timeoutCh <-chan time.Time if timeout > 0 { timeoutCh = time.After(timeout) } select { case <-notifyCh: case <-timeoutCh: return fmt.Errorf("timeout waiting for update broadcast") } } return nil } // SendTo is deprecated in favor of SendBestEffort, which requires a node to // target. func (m *Memberlist) SendTo(to net.Addr, msg []byte) error { // Encode as a user message buf := make([]byte, 1, len(msg)+1) buf[0] = byte(userMsg) buf = append(buf, msg...) // Send the message return m.rawSendMsgPacket(to.String(), nil, buf) } // SendToUDP is deprecated in favor of SendBestEffort. func (m *Memberlist) SendToUDP(to *Node, msg []byte) error { return m.SendBestEffort(to, msg) } // SendToTCP is deprecated in favor of SendReliable. func (m *Memberlist) SendToTCP(to *Node, msg []byte) error { return m.SendReliable(to, msg) } // SendBestEffort uses the unreliable packet-oriented interface of the transport // to target a user message at the given node (this does not use the gossip // mechanism). The maximum size of the message depends on the configured // UDPBufferSize for this memberlist instance. func (m *Memberlist) SendBestEffort(to *Node, msg []byte) error { // Encode as a user message buf := make([]byte, 1, len(msg)+1) buf[0] = byte(userMsg) buf = append(buf, msg...) // Send the message return m.rawSendMsgPacket(to.Address(), to, buf) } // SendReliable uses the reliable stream-oriented interface of the transport to // target a user message at the given node (this does not use the gossip // mechanism). Delivery is guaranteed if no error is returned, and there is no // limit on the size of the message. func (m *Memberlist) SendReliable(to *Node, msg []byte) error { return m.sendUserMsg(to.Address(), msg) } // Members returns a list of all known live nodes. The node structures // returned must not be modified. If you wish to modify a Node, make a // copy first. func (m *Memberlist) Members() []*Node { m.nodeLock.RLock() defer m.nodeLock.RUnlock() nodes := make([]*Node, 0, len(m.nodes)) for _, n := range m.nodes { if n.State != stateDead { nodes = append(nodes, &n.Node) } } return nodes } // NumMembers returns the number of alive nodes currently known. Between // the time of calling this and calling Members, the number of alive nodes // may have changed, so this shouldn't be used to determine how many // members will be returned by Members. func (m *Memberlist) NumMembers() (alive int) { m.nodeLock.RLock() defer m.nodeLock.RUnlock() for _, n := range m.nodes { if n.State != stateDead { alive++ } } return } // Leave will broadcast a leave message but will not shutdown the background // listeners, meaning the node will continue participating in gossip and state // updates. 
// // This will block until the leave message is successfully broadcasted to // a member of the cluster, if any exist or until a specified timeout // is reached. // // This method is safe to call multiple times, but must not be called // after the cluster is already shut down. func (m *Memberlist) Leave(timeout time.Duration) error { m.nodeLock.Lock() // We can't defer m.nodeLock.Unlock() because m.deadNode will also try to // acquire a lock so we need to Unlock before that. if m.shutdown { m.nodeLock.Unlock() panic("leave after shutdown") } if !m.leave { m.leave = true state, ok := m.nodeMap[m.config.Name] m.nodeLock.Unlock() if !ok { m.logger.Printf("[WARN] memberlist: Leave but we're not in the node map.") return nil } d := dead{ Incarnation: state.Incarnation, Node: state.Name, } m.deadNode(&d) // Block until the broadcast goes out if m.anyAlive() { var timeoutCh <-chan time.Time if timeout > 0 { timeoutCh = time.After(timeout) } select { case <-m.leaveBroadcast: case <-timeoutCh: return fmt.Errorf("timeout waiting for leave broadcast") } } } else { m.nodeLock.Unlock() } return nil } // Check for any other alive node. func (m *Memberlist) anyAlive() bool { m.nodeLock.RLock() defer m.nodeLock.RUnlock() for _, n := range m.nodes { if n.State != stateDead && n.Name != m.config.Name { return true } } return false } // GetHealthScore gives this instance's idea of how well it is meeting the soft // real-time requirements of the protocol. Lower numbers are better, and zero // means "totally healthy". func (m *Memberlist) GetHealthScore() int { return m.awareness.GetHealthScore() } // ProtocolVersion returns the protocol version currently in use by // this memberlist. func (m *Memberlist) ProtocolVersion() uint8 { // NOTE: This method exists so that in the future we can control // any locking if necessary, if we change the protocol version at // runtime, etc. return m.config.ProtocolVersion } // Shutdown will stop any background maintanence of network activity // for this memberlist, causing it to appear "dead". A leave message // will not be broadcasted prior, so the cluster being left will have // to detect this node's shutdown using probing. If you wish to more // gracefully exit the cluster, call Leave prior to shutting down. // // This method is safe to call multiple times. func (m *Memberlist) Shutdown() error { m.nodeLock.Lock() defer m.nodeLock.Unlock() if m.shutdown { return nil } // Shut down the transport first, which should block until it's // completely torn down. If we kill the memberlist-side handlers // those I/O handlers might get stuck. m.transport.Shutdown() // Now tear down everything else. 
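// Flag the instance as shut down, close shutdownCh so the stream/packet
// listener and packet handler goroutines exit, and stop the background
// tickers via deschedule.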
m.shutdown = true close(m.shutdownCh) m.deschedule() return nil } memberlist-0.1.0/memberlist_test.go000066400000000000000000000731631307374264600174320ustar00rootroot00000000000000package memberlist import ( "bytes" "fmt" "io/ioutil" "log" "net" "os" "reflect" "strings" "sync" "testing" "time" "github.com/miekg/dns" ) var bindLock sync.Mutex var bindNum byte = 10 func getBindAddr() net.IP { bindLock.Lock() defer bindLock.Unlock() result := net.IPv4(127, 0, 0, bindNum) bindNum++ if bindNum > 255 { bindNum = 10 } return result } func testConfig() *Config { config := DefaultLANConfig() config.BindAddr = getBindAddr().String() config.Name = config.BindAddr return config } func yield() { time.Sleep(5 * time.Millisecond) } type MockDelegate struct { meta []byte msgs [][]byte broadcasts [][]byte state []byte remoteState []byte } func (m *MockDelegate) NodeMeta(limit int) []byte { return m.meta } func (m *MockDelegate) NotifyMsg(msg []byte) { cp := make([]byte, len(msg)) copy(cp, msg) m.msgs = append(m.msgs, cp) } func (m *MockDelegate) GetBroadcasts(overhead, limit int) [][]byte { b := m.broadcasts m.broadcasts = nil return b } func (m *MockDelegate) LocalState(join bool) []byte { return m.state } func (m *MockDelegate) MergeRemoteState(s []byte, join bool) { m.remoteState = s } // Returns a new Memberlist on an open port by trying a range of port numbers // until something sticks. func NewMemberlistOnOpenPort(c *Config) (*Memberlist, error) { c.BindPort = 0 return newMemberlist(c) } func GetMemberlistDelegate(t *testing.T) (*Memberlist, *MockDelegate) { d := &MockDelegate{} c := testConfig() c.Delegate = d m, err := NewMemberlistOnOpenPort(c) if err != nil { t.Fatalf("failed to start: %v", err) return nil, nil } return m, d } func GetMemberlist(t *testing.T) *Memberlist { c := testConfig() m, err := NewMemberlistOnOpenPort(c) if err != nil { t.Fatalf("failed to start: %v", err) return nil } return m } func TestDefaultLANConfig_protocolVersion(t *testing.T) { c := DefaultLANConfig() if c.ProtocolVersion != ProtocolVersion2Compatible { t.Fatalf("should be max: %d", c.ProtocolVersion) } } func TestCreate_protocolVersion(t *testing.T) { cases := []struct { version uint8 err bool }{ {ProtocolVersionMin, false}, {ProtocolVersionMax, false}, // TODO(mitchellh): uncommon when we're over 0 //{ProtocolVersionMin - 1, true}, {ProtocolVersionMax + 1, true}, {ProtocolVersionMax - 1, false}, } for _, tc := range cases { c := DefaultLANConfig() c.BindAddr = getBindAddr().String() c.ProtocolVersion = tc.version m, err := Create(c) if tc.err && err == nil { t.Errorf("Should've failed with version: %d", tc.version) } else if !tc.err && err != nil { t.Errorf("Version '%d' error: %s", tc.version, err) } if err == nil { m.Shutdown() } } } func TestCreate_secretKey(t *testing.T) { cases := []struct { key []byte err bool }{ {make([]byte, 0), false}, {[]byte("abc"), true}, {make([]byte, 16), false}, {make([]byte, 38), true}, } for _, tc := range cases { c := DefaultLANConfig() c.BindAddr = getBindAddr().String() c.SecretKey = tc.key m, err := Create(c) if tc.err && err == nil { t.Errorf("Should've failed with key: %#v", tc.key) } else if !tc.err && err != nil { t.Errorf("Key '%#v' error: %s", tc.key, err) } if err == nil { m.Shutdown() } } } func TestCreate_secretKeyEmpty(t *testing.T) { c := DefaultLANConfig() c.BindAddr = getBindAddr().String() c.SecretKey = make([]byte, 0) m, err := Create(c) if err != nil { t.Fatalf("err: %s", err) } defer m.Shutdown() if m.config.EncryptionEnabled() { t.Fatalf("Expected 
encryption to be disabled") } } func TestCreate_keyringOnly(t *testing.T) { c := DefaultLANConfig() c.BindAddr = getBindAddr().String() keyring, err := NewKeyring(nil, make([]byte, 16)) if err != nil { t.Fatalf("err: %s", err) } c.Keyring = keyring m, err := Create(c) if err != nil { t.Fatalf("err: %s", err) } defer m.Shutdown() if !m.config.EncryptionEnabled() { t.Fatalf("Expected encryption to be enabled") } } func TestCreate_keyringAndSecretKey(t *testing.T) { c := DefaultLANConfig() c.BindAddr = getBindAddr().String() keyring, err := NewKeyring(nil, make([]byte, 16)) if err != nil { t.Fatalf("err: %s", err) } c.Keyring = keyring c.SecretKey = []byte{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1} m, err := Create(c) if err != nil { t.Fatalf("err: %s", err) } defer m.Shutdown() if !m.config.EncryptionEnabled() { t.Fatalf("Expected encryption to be enabled") } ringKeys := c.Keyring.GetKeys() if !bytes.Equal(c.SecretKey, ringKeys[0]) { t.Fatalf("Unexpected primary key %v", ringKeys[0]) } } func TestCreate_invalidLoggerSettings(t *testing.T) { c := DefaultLANConfig() c.BindAddr = getBindAddr().String() c.Logger = log.New(ioutil.Discard, "", log.LstdFlags) c.LogOutput = ioutil.Discard _, err := Create(c) if err == nil { t.Fatal("Memberlist should not allow both LogOutput and Logger to be set, but it did not raise an error") } } func TestCreate(t *testing.T) { c := testConfig() c.ProtocolVersion = ProtocolVersionMin c.DelegateProtocolVersion = 13 c.DelegateProtocolMin = 12 c.DelegateProtocolMax = 24 m, err := Create(c) if err != nil { t.Fatalf("err: %s", err) } defer m.Shutdown() yield() members := m.Members() if len(members) != 1 { t.Fatalf("bad number of members") } if members[0].PMin != ProtocolVersionMin { t.Fatalf("bad: %#v", members[0]) } if members[0].PMax != ProtocolVersionMax { t.Fatalf("bad: %#v", members[0]) } if members[0].PCur != c.ProtocolVersion { t.Fatalf("bad: %#v", members[0]) } if members[0].DMin != c.DelegateProtocolMin { t.Fatalf("bad: %#v", members[0]) } if members[0].DMax != c.DelegateProtocolMax { t.Fatalf("bad: %#v", members[0]) } if members[0].DCur != c.DelegateProtocolVersion { t.Fatalf("bad: %#v", members[0]) } } func TestMemberList_CreateShutdown(t *testing.T) { m := GetMemberlist(t) m.schedule() if err := m.Shutdown(); err != nil { t.Fatalf("failed to shutdown %v", err) } } func TestMemberList_ResolveAddr(t *testing.T) { m := GetMemberlist(t) if _, err := m.resolveAddr("localhost"); err != nil { t.Fatalf("Could not resolve localhost: %s", err) } if _, err := m.resolveAddr("[::1]:80"); err != nil { t.Fatalf("Could not understand ipv6 pair: %s", err) } if _, err := m.resolveAddr("[::1]"); err != nil { t.Fatalf("Could not understand ipv6 non-pair") } if _, err := m.resolveAddr(":80"); err == nil { t.Fatalf("Understood hostless port") } if _, err := m.resolveAddr("localhost:80"); err != nil { t.Fatalf("Could not understand hostname port combo: %s", err) } if _, err := m.resolveAddr("localhost:80000"); err == nil { t.Fatalf("Understood too high port") } if _, err := m.resolveAddr("127.0.0.1:80"); err != nil { t.Fatalf("Could not understand hostname port combo: %s", err) } if _, err := m.resolveAddr("[2001:db8:a0b:12f0::1]:80"); err != nil { t.Fatalf("Could not understand hostname port combo: %s", err) } } type dnsHandler struct { t *testing.T } func (h dnsHandler) ServeDNS(w dns.ResponseWriter, r *dns.Msg) { if len(r.Question) != 1 { h.t.Fatalf("bad: %#v", r.Question) } name := "join.service.consul." 
question := r.Question[0] if question.Name != name || question.Qtype != dns.TypeANY { h.t.Fatalf("bad: %#v", question) } m := new(dns.Msg) m.SetReply(r) m.Authoritative = true m.RecursionAvailable = false m.Answer = append(m.Answer, &dns.A{ Hdr: dns.RR_Header{ Name: name, Rrtype: dns.TypeA, Class: dns.ClassINET}, A: net.ParseIP("127.0.0.1"), }) m.Answer = append(m.Answer, &dns.AAAA{ Hdr: dns.RR_Header{ Name: name, Rrtype: dns.TypeAAAA, Class: dns.ClassINET}, AAAA: net.ParseIP("2001:db8:a0b:12f0::1"), }) if err := w.WriteMsg(m); err != nil { h.t.Fatalf("err: %v", err) } } func TestMemberList_ResolveAddr_TCP_First(t *testing.T) { bind := "127.0.0.1:8600" var wg sync.WaitGroup wg.Add(1) server := &dns.Server{ Addr: bind, Handler: dnsHandler{t}, Net: "tcp", NotifyStartedFunc: wg.Done, } defer server.Shutdown() go func() { if err := server.ListenAndServe(); err != nil && !strings.Contains(err.Error(), "use of closed network connection") { t.Fatalf("err: %v", err) } }() wg.Wait() tmpFile, err := ioutil.TempFile("", "") if err != nil { t.Fatalf("err: %v", err) } defer os.Remove(tmpFile.Name()) content := []byte(fmt.Sprintf("nameserver %s", bind)) if _, err := tmpFile.Write(content); err != nil { t.Fatalf("err: %v", err) } if err := tmpFile.Close(); err != nil { t.Fatalf("err: %v", err) } m := GetMemberlist(t) m.config.DNSConfigPath = tmpFile.Name() m.setAlive() m.schedule() defer m.Shutdown() // Try with and without the trailing dot. hosts := []string{ "join.service.consul.", "join.service.consul", } for _, host := range hosts { ips, err := m.resolveAddr(host) if err != nil { t.Fatalf("err: %v", err) } port := uint16(m.config.BindPort) expected := []ipPort{ ipPort{net.ParseIP("127.0.0.1"), port}, ipPort{net.ParseIP("2001:db8:a0b:12f0::1"), port}, } if !reflect.DeepEqual(ips, expected) { t.Fatalf("bad: %#v", ips) } } } func TestMemberList_Members(t *testing.T) { n1 := &Node{Name: "test"} n2 := &Node{Name: "test2"} n3 := &Node{Name: "test3"} m := &Memberlist{} nodes := []*nodeState{ &nodeState{Node: *n1, State: stateAlive}, &nodeState{Node: *n2, State: stateDead}, &nodeState{Node: *n3, State: stateSuspect}, } m.nodes = nodes members := m.Members() if !reflect.DeepEqual(members, []*Node{n1, n3}) { t.Fatalf("bad members") } } func TestMemberlist_Join(t *testing.T) { m1 := GetMemberlist(t) m1.setAlive() m1.schedule() defer m1.Shutdown() // Create a second node c := DefaultLANConfig() addr1 := getBindAddr() c.Name = addr1.String() c.BindAddr = addr1.String() c.BindPort = m1.config.BindPort m2, err := Create(c) if err != nil { t.Fatalf("unexpected err: %s", err) } defer m2.Shutdown() num, err := m2.Join([]string{m1.config.BindAddr}) if num != 1 { t.Fatalf("unexpected 1: %d", num) } if err != nil { t.Fatalf("unexpected err: %s", err) } // Check the hosts if len(m2.Members()) != 2 { t.Fatalf("should have 2 nodes! %v", m2.Members()) } if m2.estNumNodes() != 2 { t.Fatalf("should have 2 nodes! 
%v", m2.Members()) } } type CustomMergeDelegate struct { invoked bool } func (c *CustomMergeDelegate) NotifyMerge(nodes []*Node) error { log.Printf("Cancel merge") c.invoked = true return fmt.Errorf("Custom merge canceled") } func TestMemberlist_Join_Cancel(t *testing.T) { m1 := GetMemberlist(t) merge1 := &CustomMergeDelegate{} m1.config.Merge = merge1 m1.setAlive() m1.schedule() defer m1.Shutdown() // Create a second node c := DefaultLANConfig() addr1 := getBindAddr() c.Name = addr1.String() c.BindAddr = addr1.String() c.BindPort = m1.config.BindPort m2, err := Create(c) if err != nil { t.Fatalf("unexpected err: %s", err) } merge2 := &CustomMergeDelegate{} m2.config.Merge = merge2 defer m2.Shutdown() num, err := m2.Join([]string{m1.config.BindAddr}) if num != 0 { t.Fatalf("unexpected 0: %d", num) } if !strings.Contains(err.Error(), "Custom merge canceled") { t.Fatalf("unexpected err: %s", err) } // Check the hosts if len(m2.Members()) != 1 { t.Fatalf("should have 1 nodes! %v", m2.Members()) } if len(m1.Members()) != 1 { t.Fatalf("should have 1 nodes! %v", m1.Members()) } // Check delegate invocation if !merge1.invoked { t.Fatalf("should invoke delegate") } if !merge2.invoked { t.Fatalf("should invoke delegate") } } type CustomAliveDelegate struct { Ignore string count int } func (c *CustomAliveDelegate) NotifyAlive(peer *Node) error { c.count++ if peer.Name == c.Ignore { return nil } log.Printf("Cancel alive") return fmt.Errorf("Custom alive canceled") } func TestMemberlist_Join_Cancel_Passive(t *testing.T) { m1 := GetMemberlist(t) alive1 := &CustomAliveDelegate{ Ignore: m1.config.Name, } m1.config.Alive = alive1 m1.setAlive() m1.schedule() defer m1.Shutdown() // Create a second node c := DefaultLANConfig() addr1 := getBindAddr() c.Name = addr1.String() c.BindAddr = addr1.String() c.BindPort = m1.config.BindPort m2, err := Create(c) if err != nil { t.Fatalf("unexpected err: %s", err) } alive2 := &CustomAliveDelegate{ Ignore: c.Name, } m2.config.Alive = alive2 defer m2.Shutdown() num, err := m2.Join([]string{m1.config.BindAddr}) if num != 1 { t.Fatalf("unexpected 1: %d", num) } if err != nil { t.Fatalf("err: %s", err) } // Check the hosts if len(m2.Members()) != 1 { t.Fatalf("should have 1 nodes! %v", m2.Members()) } if len(m1.Members()) != 1 { t.Fatalf("should have 1 nodes! 
%v", m1.Members()) } // Check delegate invocation if alive1.count == 0 { t.Fatalf("should invoke delegate: %d", alive1.count) } if alive2.count == 0 { t.Fatalf("should invoke delegate: %d", alive2.count) } } func TestMemberlist_Join_protocolVersions(t *testing.T) { c1 := testConfig() c2 := testConfig() c3 := testConfig() c3.ProtocolVersion = ProtocolVersionMax m1, err := Create(c1) if err != nil { t.Fatalf("err: %s", err) } defer m1.Shutdown() m2, err := Create(c2) if err != nil { t.Fatalf("err: %s", err) } defer m2.Shutdown() m3, err := Create(c3) if err != nil { t.Fatalf("err: %s", err) } defer m3.Shutdown() _, err = m1.Join([]string{c2.BindAddr}) if err != nil { t.Fatalf("err: %s", err) } yield() _, err = m1.Join([]string{c3.BindAddr}) if err != nil { t.Fatalf("err: %s", err) } } func TestMemberlist_Leave(t *testing.T) { m1 := GetMemberlist(t) m1.setAlive() m1.schedule() defer m1.Shutdown() // Create a second node c := DefaultLANConfig() addr1 := getBindAddr() c.Name = addr1.String() c.BindAddr = addr1.String() c.BindPort = m1.config.BindPort c.GossipInterval = time.Millisecond m2, err := Create(c) if err != nil { t.Fatalf("unexpected err: %s", err) } defer m2.Shutdown() num, err := m2.Join([]string{m1.config.BindAddr}) if num != 1 { t.Fatalf("unexpected 1: %d", num) } if err != nil { t.Fatalf("unexpected err: %s", err) } // Check the hosts if len(m2.Members()) != 2 { t.Fatalf("should have 2 nodes! %v", m2.Members()) } if len(m1.Members()) != 2 { t.Fatalf("should have 2 nodes! %v", m2.Members()) } // Leave m1.Leave(time.Second) // Wait for leave time.Sleep(10 * time.Millisecond) // m1 should think dead if len(m1.Members()) != 1 { t.Fatalf("should have 1 node") } if len(m2.Members()) != 1 { t.Fatalf("should have 1 node") } } func TestMemberlist_JoinShutdown(t *testing.T) { m1 := GetMemberlist(t) m1.setAlive() m1.schedule() // Create a second node c := DefaultLANConfig() addr1 := getBindAddr() c.Name = addr1.String() c.BindAddr = addr1.String() c.BindPort = m1.config.BindPort c.ProbeInterval = time.Millisecond c.ProbeTimeout = 100 * time.Microsecond c.SuspicionMaxTimeoutMult = 1 m2, err := Create(c) if err != nil { t.Fatalf("unexpected err: %s", err) } defer m2.Shutdown() num, err := m2.Join([]string{m1.config.BindAddr}) if num != 1 { t.Fatalf("unexpected 1: %d", num) } if err != nil { t.Fatalf("unexpected err: %s", err) } // Check the hosts if len(m2.Members()) != 2 { t.Fatalf("should have 2 nodes! %v", m2.Members()) } m1.Shutdown() time.Sleep(10 * time.Millisecond) if len(m2.Members()) != 1 { t.Fatalf("should have 1 nodes! 
%v", m2.Members()) } } func TestMemberlist_delegateMeta(t *testing.T) { c1 := testConfig() c2 := testConfig() c1.Delegate = &MockDelegate{meta: []byte("web")} c2.Delegate = &MockDelegate{meta: []byte("lb")} m1, err := Create(c1) if err != nil { t.Fatalf("err: %s", err) } defer m1.Shutdown() m2, err := Create(c2) if err != nil { t.Fatalf("err: %s", err) } defer m2.Shutdown() _, err = m1.Join([]string{c2.BindAddr}) if err != nil { t.Fatalf("err: %s", err) } yield() var roles map[string]string // Check the roles of members of m1 m1m := m1.Members() if len(m1m) != 2 { t.Fatalf("bad: %#v", m1m) } roles = make(map[string]string) for _, m := range m1m { roles[m.Name] = string(m.Meta) } if r := roles[c1.Name]; r != "web" { t.Fatalf("bad role for %s: %s", c1.Name, r) } if r := roles[c2.Name]; r != "lb" { t.Fatalf("bad role for %s: %s", c2.Name, r) } // Check the roles of members of m2 m2m := m2.Members() if len(m2m) != 2 { t.Fatalf("bad: %#v", m2m) } roles = make(map[string]string) for _, m := range m2m { roles[m.Name] = string(m.Meta) } if r := roles[c1.Name]; r != "web" { t.Fatalf("bad role for %s: %s", c1.Name, r) } if r := roles[c2.Name]; r != "lb" { t.Fatalf("bad role for %s: %s", c2.Name, r) } } func TestMemberlist_delegateMeta_Update(t *testing.T) { c1 := testConfig() c2 := testConfig() mock1 := &MockDelegate{meta: []byte("web")} mock2 := &MockDelegate{meta: []byte("lb")} c1.Delegate = mock1 c2.Delegate = mock2 m1, err := Create(c1) if err != nil { t.Fatalf("err: %s", err) } defer m1.Shutdown() m2, err := Create(c2) if err != nil { t.Fatalf("err: %s", err) } defer m2.Shutdown() _, err = m1.Join([]string{c2.BindAddr}) if err != nil { t.Fatalf("err: %s", err) } yield() // Update the meta data roles mock1.meta = []byte("api") mock2.meta = []byte("db") m1.UpdateNode(0) m2.UpdateNode(0) yield() // Check the updates have propagated var roles map[string]string // Check the roles of members of m1 m1m := m1.Members() if len(m1m) != 2 { t.Fatalf("bad: %#v", m1m) } roles = make(map[string]string) for _, m := range m1m { roles[m.Name] = string(m.Meta) } if r := roles[c1.Name]; r != "api" { t.Fatalf("bad role for %s: %s", c1.Name, r) } if r := roles[c2.Name]; r != "db" { t.Fatalf("bad role for %s: %s", c2.Name, r) } // Check the roles of members of m2 m2m := m2.Members() if len(m2m) != 2 { t.Fatalf("bad: %#v", m2m) } roles = make(map[string]string) for _, m := range m2m { roles[m.Name] = string(m.Meta) } if r := roles[c1.Name]; r != "api" { t.Fatalf("bad role for %s: %s", c1.Name, r) } if r := roles[c2.Name]; r != "db" { t.Fatalf("bad role for %s: %s", c2.Name, r) } } func TestMemberlist_UserData(t *testing.T) { m1, d1 := GetMemberlistDelegate(t) d1.state = []byte("something") m1.setAlive() m1.schedule() defer m1.Shutdown() // Create a second delegate with things to send d2 := &MockDelegate{} d2.broadcasts = [][]byte{ []byte("test"), []byte("foobar"), } d2.state = []byte("my state") // Create a second node c := DefaultLANConfig() addr1 := getBindAddr() c.Name = addr1.String() c.BindAddr = addr1.String() c.BindPort = m1.config.BindPort c.GossipInterval = time.Millisecond c.PushPullInterval = time.Millisecond c.Delegate = d2 m2, err := Create(c) if err != nil { t.Fatalf("unexpected err: %s", err) } num, err := m2.Join([]string{m1.config.BindAddr}) if num != 1 { t.Fatalf("unexpected 1: %d", num) } if err != nil { t.Fatalf("unexpected err: %s", err) } defer m2.Shutdown() // Check the hosts if m2.NumMembers() != 2 { t.Fatalf("should have 2 nodes! 
%v", m2.Members()) } // Wait for a little while time.Sleep(3 * time.Millisecond) // Ensure we got the messages if len(d1.msgs) != 2 { t.Fatalf("should have 2 messages!") } if !reflect.DeepEqual(d1.msgs[0], []byte("test")) { t.Fatalf("bad msg %v", d1.msgs[0]) } if !reflect.DeepEqual(d1.msgs[1], []byte("foobar")) { t.Fatalf("bad msg %v", d1.msgs[1]) } // Check the push/pull state if !reflect.DeepEqual(d1.remoteState, []byte("my state")) { t.Fatalf("bad state %s", d1.remoteState) } if !reflect.DeepEqual(d2.remoteState, []byte("something")) { t.Fatalf("bad state %s", d2.remoteState) } } func TestMemberlist_SendTo(t *testing.T) { m1, d1 := GetMemberlistDelegate(t) m1.setAlive() m1.schedule() defer m1.Shutdown() // Create a second delegate with things to send d2 := &MockDelegate{} // Create a second node c := DefaultLANConfig() addr1 := getBindAddr() c.Name = addr1.String() c.BindAddr = addr1.String() c.BindPort = m1.config.BindPort c.GossipInterval = time.Millisecond c.PushPullInterval = time.Millisecond c.Delegate = d2 m2, err := Create(c) if err != nil { t.Fatalf("unexpected err: %s", err) } defer m2.Shutdown() num, err := m2.Join([]string{m1.config.BindAddr}) if num != 1 { t.Fatalf("unexpected 1: %d", num) } if err != nil { t.Fatalf("unexpected err: %s", err) } // Check the hosts if m2.NumMembers() != 2 { t.Fatalf("should have 2 nodes! %v", m2.Members()) } // Try to do a direct send m2Addr := &net.UDPAddr{IP: addr1, Port: c.BindPort} if err := m1.SendTo(m2Addr, []byte("ping")); err != nil { t.Fatalf("err: %v", err) } m1Addr := &net.UDPAddr{IP: net.ParseIP(m1.config.BindAddr), Port: m1.config.BindPort} if err := m2.SendTo(m1Addr, []byte("pong")); err != nil { t.Fatalf("err: %v", err) } // Wait for a little while time.Sleep(3 * time.Millisecond) // Ensure we got the messages if len(d1.msgs) != 1 { t.Fatalf("should have 1 messages!") } if !reflect.DeepEqual(d1.msgs[0], []byte("pong")) { t.Fatalf("bad msg %v", d1.msgs[0]) } if len(d2.msgs) != 1 { t.Fatalf("should have 1 messages!") } if !reflect.DeepEqual(d2.msgs[0], []byte("ping")) { t.Fatalf("bad msg %v", d2.msgs[0]) } } func TestMemberlistProtocolVersion(t *testing.T) { c := DefaultLANConfig() c.BindAddr = getBindAddr().String() c.ProtocolVersion = ProtocolVersionMax m, err := Create(c) if err != nil { t.Fatalf("err: %s", err) } defer m.Shutdown() result := m.ProtocolVersion() if result != ProtocolVersionMax { t.Fatalf("bad: %d", result) } } func TestMemberlist_Join_DeadNode(t *testing.T) { m1 := GetMemberlist(t) m1.config.TCPTimeout = 50 * time.Millisecond m1.setAlive() m1.schedule() defer m1.Shutdown() // Create a second "node", which is just a TCP listener that // does not ever respond. This is to test our deadliens addr1 := getBindAddr() list, err := net.Listen("tcp", fmt.Sprintf("%s:%d", addr1.String(), m1.config.BindPort)) if err != nil { t.Fatalf("err: %v", err) } defer list.Close() // Ensure we don't hang forever timer := time.AfterFunc(100*time.Millisecond, func() { panic("should have timed out by now") }) defer timer.Stop() num, err := m1.Join([]string{addr1.String()}) if num != 0 { t.Fatalf("unexpected 0: %d", num) } if err == nil { t.Fatal("expect err") } } // Tests that nodes running different versions of the protocol can successfully // discover each other and add themselves to their respective member lists. 
func TestMemberlist_Join_Prototocol_Compatibility(t *testing.T) { testProtocolVersionPair := func(t *testing.T, pv1 uint8, pv2 uint8) { c1 := testConfig() c1.ProtocolVersion = pv1 m1, err := NewMemberlistOnOpenPort(c1) if err != nil { t.Fatalf("failed to start: %v", err) } m1.setAlive() m1.schedule() defer m1.Shutdown() c2 := DefaultLANConfig() addr1 := getBindAddr() c2.Name = addr1.String() c2.BindAddr = addr1.String() c2.BindPort = m1.config.BindPort c2.ProtocolVersion = pv2 m2, err := Create(c2) if err != nil { t.Fatalf("unexpected err: %s", err) } defer m2.Shutdown() num, err := m2.Join([]string{m1.config.BindAddr}) if num != 1 { t.Fatalf("unexpected 1: %d", num) } if err != nil { t.Fatalf("unexpected err: %s", err) } // Check the hosts if len(m2.Members()) != 2 { t.Fatalf("should have 2 nodes! %v", m2.Members()) } // Check the hosts if len(m1.Members()) != 2 { t.Fatalf("should have 2 nodes! %v", m1.Members()) } } testProtocolVersionPair(t, 2, 1) testProtocolVersionPair(t, 2, 3) testProtocolVersionPair(t, 3, 2) testProtocolVersionPair(t, 3, 1) } func TestMemberlist_Join_IPv6(t *testing.T) { // Since this binds to all interfaces we need to exclude other tests // from grabbing an interface. bindLock.Lock() defer bindLock.Unlock() c1 := DefaultLANConfig() c1.Name = "A" c1.BindAddr = "[::1]" var m1 *Memberlist var err error for i := 0; i < 100; i++ { c1.BindPort = 23456 + i m1, err = Create(c1) if err == nil { break } } if err != nil { t.Fatalf("unexpected err: %s", err) } defer m1.Shutdown() // Create a second node c2 := DefaultLANConfig() c2.Name = "B" c2.BindAddr = "[::1]" var m2 *Memberlist for i := 0; i < 100; i++ { c2.BindPort = c1.BindPort + 1 + i m2, err = Create(c2) if err == nil { break } } if err != nil { t.Fatalf("unexpected err: %s", err) } defer m2.Shutdown() num, err := m2.Join([]string{fmt.Sprintf("%s:%d", m1.config.BindAddr, 23456)}) if num != 1 { t.Fatalf("unexpected 1: %d", num) } if err != nil { t.Fatalf("unexpected err: %s", err) } // Check the hosts if len(m2.Members()) != 2 { t.Fatalf("should have 2 nodes! %v", m2.Members()) } if len(m1.Members()) != 2 { t.Fatalf("should have 2 nodes! 
%v", m2.Members()) } } func TestAdvertiseAddr(t *testing.T) { c := testConfig() c.AdvertiseAddr = "127.0.1.100" c.AdvertisePort = 23456 m, err := Create(c) if err != nil { t.Fatalf("err: %s", err) } defer m.Shutdown() yield() members := m.Members() if len(members) != 1 { t.Fatalf("bad number of members") } if bytes.Compare(members[0].Addr, []byte{127, 0, 1, 100}) != 0 { t.Fatalf("bad: %#v", members[0]) } if members[0].Port != 23456 { t.Fatalf("bad: %#v", members[0]) } } type MockConflict struct { existing *Node other *Node } func (m *MockConflict) NotifyConflict(existing, other *Node) { m.existing = existing m.other = other } func TestMemberlist_conflictDelegate(t *testing.T) { c1 := testConfig() c2 := testConfig() mock := &MockConflict{} c1.Conflict = mock // Ensure name conflict c2.Name = c1.Name m1, err := Create(c1) if err != nil { t.Fatalf("err: %s", err) } defer m1.Shutdown() m2, err := Create(c2) if err != nil { t.Fatalf("err: %s", err) } defer m2.Shutdown() _, err = m1.Join([]string{c2.BindAddr}) if err != nil { t.Fatalf("err: %s", err) } yield() // Ensure we were notified if mock.existing == nil || mock.other == nil { t.Fatalf("should get notified") } if mock.existing.Name != mock.other.Name { t.Fatalf("bad: %v %v", mock.existing, mock.other) } } type MockPing struct { other *Node rtt time.Duration payload []byte } func (m *MockPing) NotifyPingComplete(other *Node, rtt time.Duration, payload []byte) { m.other = other m.rtt = rtt m.payload = payload } const DEFAULT_PAYLOAD = "whatever" func (m *MockPing) AckPayload() []byte { return []byte(DEFAULT_PAYLOAD) } func TestMemberlist_PingDelegate(t *testing.T) { m1 := GetMemberlist(t) m1.config.Ping = &MockPing{} m1.setAlive() m1.schedule() defer m1.Shutdown() // Create a second node c := DefaultLANConfig() addr1 := getBindAddr() c.Name = addr1.String() c.BindAddr = addr1.String() c.BindPort = m1.config.BindPort c.ProbeInterval = time.Millisecond mock := &MockPing{} c.Ping = mock m2, err := Create(c) if err != nil { t.Fatalf("err: %s", err) } defer m2.Shutdown() _, err = m2.Join([]string{m1.config.BindAddr}) if err != nil { t.Fatalf("err: %s", err) } yield() // Ensure we were notified if mock.other == nil { t.Fatalf("should get notified") } if !reflect.DeepEqual(mock.other, m1.LocalNode()) { t.Fatalf("not notified about the correct node; expected: %+v; actual: %+v", m2.LocalNode(), mock.other) } if mock.rtt <= 0 { t.Fatalf("rtt should be greater than 0") } if bytes.Compare(mock.payload, []byte(DEFAULT_PAYLOAD)) != 0 { t.Fatalf("incorrect payload. expected: %v; actual: %v", []byte(DEFAULT_PAYLOAD), mock.payload) } } // Consul bug, rapid restart (before failure detection), // with an updated meta data. Should be at incarnation 1 for // both. // // This test is uncommented because it requires that either we // can rebind the socket (SO_REUSEPORT) which Go does not allow, // OR we must disable the address conflict checking in memberlist. // I just comment out that code to test this case. 
// //func TestMemberlist_Restart_delegateMeta_Update(t *testing.T) { // c1 := testConfig() // c2 := testConfig() // mock1 := &MockDelegate{meta: []byte("web")} // mock2 := &MockDelegate{meta: []byte("lb")} // c1.Delegate = mock1 // c2.Delegate = mock2 // m1, err := Create(c1) // if err != nil { // t.Fatalf("err: %s", err) // } // defer m1.Shutdown() // m2, err := Create(c2) // if err != nil { // t.Fatalf("err: %s", err) // } // defer m2.Shutdown() // _, err = m1.Join([]string{c2.BindAddr}) // if err != nil { // t.Fatalf("err: %s", err) // } // yield() // // Recreate m1 with updated meta // m1.Shutdown() // c3 := testConfig() // c3.Name = c1.Name // c3.Delegate = mock1 // c3.GossipInterval = time.Millisecond // mock1.meta = []byte("api") // m1, err = Create(c3) // if err != nil { // t.Fatalf("err: %s", err) // } // defer m1.Shutdown() // _, err = m1.Join([]string{c2.BindAddr}) // if err != nil { // t.Fatalf("err: %s", err) // } // yield() // yield() // // Check the updates have propagated // var roles map[string]string // // Check the roles of members of m1 // m1m := m1.Members() // if len(m1m) != 2 { // t.Fatalf("bad: %#v", m1m) // } // roles = make(map[string]string) // for _, m := range m1m { // roles[m.Name] = string(m.Meta) // } // if r := roles[c1.Name]; r != "api" { // t.Fatalf("bad role for %s: %s", c1.Name, r) // } // if r := roles[c2.Name]; r != "lb" { // t.Fatalf("bad role for %s: %s", c2.Name, r) // } // // Check the roles of members of m2 // m2m := m2.Members() // if len(m2m) != 2 { // t.Fatalf("bad: %#v", m2m) // } // roles = make(map[string]string) // for _, m := range m2m { // roles[m.Name] = string(m.Meta) // } // if r := roles[c1.Name]; r != "api" { // t.Fatalf("bad role for %s: %s", c1.Name, r) // } // if r := roles[c2.Name]; r != "lb" { // t.Fatalf("bad role for %s: %s", c2.Name, r) // } //} memberlist-0.1.0/merge_delegate.go000066400000000000000000000010721307374264600171470ustar00rootroot00000000000000package memberlist // MergeDelegate is used to involve a client in // a potential cluster merge operation. Namely, when // a node does a TCP push/pull (as part of a join), // the delegate is involved and allowed to cancel the join // based on custom logic. The merge delegate is NOT invoked // as part of the push-pull anti-entropy. type MergeDelegate interface { // NotifyMerge is invoked when a merge could take place. // Provides a list of the nodes known by the peer. If // the return value is non-nil, the merge is canceled. NotifyMerge(peers []*Node) error } memberlist-0.1.0/mock_transport.go000066400000000000000000000050201307374264600172600ustar00rootroot00000000000000package memberlist import ( "fmt" "net" "strconv" "time" ) // MockNetwork is used as a factory that produces MockTransport instances which // are uniquely addressed and wired up to talk to each other. type MockNetwork struct { transports map[string]*MockTransport port int } // NewTransport returns a new MockTransport with a unique address, wired up to // talk to the other transports in the MockNetwork. func (n *MockNetwork) NewTransport() *MockTransport { n.port += 1 addr := fmt.Sprintf("127.0.0.1:%d", n.port) transport := &MockTransport{ net: n, addr: &MockAddress{addr}, packetCh: make(chan *Packet), streamCh: make(chan net.Conn), } if n.transports == nil { n.transports = make(map[string]*MockTransport) } n.transports[addr] = transport return transport } // MockAddress is a wrapper which adds the net.Addr interface to our mock // address scheme. type MockAddress struct { addr string } // See net.Addr. 
func (a *MockAddress) Network() string { return "mock" } // See net.Addr. func (a *MockAddress) String() string { return a.addr } // MockTransport directly plumbs messages to other transports its MockNetwork. type MockTransport struct { net *MockNetwork addr *MockAddress packetCh chan *Packet streamCh chan net.Conn } // See Transport. func (t *MockTransport) FinalAdvertiseAddr(string, int) (net.IP, int, error) { host, portStr, err := net.SplitHostPort(t.addr.String()) if err != nil { return nil, 0, err } ip := net.ParseIP(host) if ip == nil { return nil, 0, fmt.Errorf("Failed to parse IP %q", host) } port, err := strconv.ParseInt(portStr, 10, 16) if err != nil { return nil, 0, err } return ip, int(port), nil } // See Transport. func (t *MockTransport) WriteTo(b []byte, addr string) (time.Time, error) { dest, ok := t.net.transports[addr] if !ok { return time.Time{}, fmt.Errorf("No route to %q", addr) } now := time.Now() dest.packetCh <- &Packet{ Buf: b, From: t.addr, Timestamp: now, } return now, nil } // See Transport. func (t *MockTransport) PacketCh() <-chan *Packet { return t.packetCh } // See Transport. func (t *MockTransport) DialTimeout(addr string, timeout time.Duration) (net.Conn, error) { dest, ok := t.net.transports[addr] if !ok { return nil, fmt.Errorf("No route to %q", addr) } p1, p2 := net.Pipe() dest.streamCh <- p1 return p2, nil } // See Transport. func (t *MockTransport) StreamCh() <-chan net.Conn { return t.streamCh } // See Transport. func (t *MockTransport) Shutdown() error { return nil } memberlist-0.1.0/net.go000066400000000000000000000723161307374264600150150ustar00rootroot00000000000000package memberlist import ( "bufio" "bytes" "encoding/binary" "fmt" "hash/crc32" "io" "net" "time" "github.com/armon/go-metrics" "github.com/hashicorp/go-msgpack/codec" ) // This is the minimum and maximum protocol version that we can // _understand_. We're allowed to speak at any version within this // range. This range is inclusive. const ( ProtocolVersionMin uint8 = 1 // Version 3 added support for TCP pings but we kept the default // protocol version at 2 to ease transition to this new feature. // A memberlist speaking version 2 of the protocol will attempt // to TCP ping another memberlist who understands version 3 or // greater. // // Version 4 added support for nacks as part of indirect probes. // A memberlist speaking version 2 of the protocol will expect // nacks from another memberlist who understands version 4 or // greater, and likewise nacks will be sent to memberlists who // understand version 4 or greater. ProtocolVersion2Compatible = 2 ProtocolVersionMax = 5 ) // messageType is an integer ID of a type of message that can be received // on network channels from other members. type messageType uint8 // The list of available message types. 
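// The first byte of every (decrypted) packet and of every inbound stream
// identifies one of these types; handleCommand and handleConn switch on it to
// dispatch. For example, SendTo frames a raw user payload simply as:
//
//    buf := make([]byte, 1, len(msg)+1)
//    buf[0] = byte(userMsg)
//    buf = append(buf, msg...)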
const (
	pingMsg messageType = iota
	indirectPingMsg
	ackRespMsg
	suspectMsg
	aliveMsg
	deadMsg
	pushPullMsg
	compoundMsg
	userMsg // User msg, not handled by us
	compressMsg
	encryptMsg
	nackRespMsg
	hasCrcMsg
)

// compressionType is used to specify the compression algorithm
type compressionType uint8

const (
	lzwAlgo compressionType = iota
)

const (
	MetaMaxSize            = 512 // Maximum size for node meta data
	compoundHeaderOverhead = 2   // Assumed header overhead
	compoundOverhead       = 2   // Assumed overhead per entry in compoundHeader
	userMsgOverhead        = 1
	blockingWarning        = 10 * time.Millisecond // Warn if a UDP packet takes this long to process
	maxPushStateBytes      = 10 * 1024 * 1024
)

// ping request sent directly to node
type ping struct {
	SeqNo uint32

	// Node is sent so the target can verify they are
	// the intended recipient. This is to protect against an agent
	// restart with a new name.
	Node string
}

// indirect ping sent to an indirect node
type indirectPingReq struct {
	SeqNo  uint32
	Target []byte
	Port   uint16
	Node   string
	Nack   bool // true if we'd like a nack back
}

// ack response is sent for a ping
type ackResp struct {
	SeqNo   uint32
	Payload []byte
}

// nack response is sent for an indirect ping when the pinger doesn't hear from
// the ping-ee within the configured timeout. This lets the original node know
// that the indirect ping attempt happened but didn't succeed.
type nackResp struct {
	SeqNo uint32
}

// suspect is broadcast when we suspect a node is dead
type suspect struct {
	Incarnation uint32
	Node        string
	From        string // Include who is suspecting
}

// alive is broadcast when we know a node is alive.
// Overloaded for nodes joining
type alive struct {
	Incarnation uint32
	Node        string
	Addr        []byte
	Port        uint16
	Meta        []byte

	// The versions of the protocol/delegate that are being spoken, order:
	// pmin, pmax, pcur, dmin, dmax, dcur
	Vsn []uint8
}

// dead is broadcast when we confirm a node is dead
// Overloaded for nodes leaving
type dead struct {
	Incarnation uint32
	Node        string
	From        string // Include who is suspecting
}

// pushPullHeader is used to inform the
// other side how many states we are transferring
type pushPullHeader struct {
	Nodes        int
	UserStateLen int  // Encodes the byte length of user state
	Join         bool // Is this a join request or an anti-entropy run
}

// userMsgHeader is used to encapsulate a userMsg
type userMsgHeader struct {
	UserMsgLen int // Encodes the byte length of user state
}

// pushNodeState is used for pushPullReq when we are
// transferring out node states
type pushNodeState struct {
	Name        string
	Addr        []byte
	Port        uint16
	Meta        []byte
	Incarnation uint32
	State       nodeStateType
	Vsn         []uint8 // Protocol versions
}

// compress is used to wrap an underlying payload
// using a specified compression algorithm
type compress struct {
	Algo compressionType
	Buf  []byte
}

// msgHandoff is used to transfer a message between goroutines
type msgHandoff struct {
	msgType messageType
	buf     []byte
	from    net.Addr
}

// encryptionVersion returns the encryption version to use
func (m *Memberlist) encryptionVersion() encryptionVersion {
	switch m.ProtocolVersion() {
	case 1:
		return 0
	default:
		return 1
	}
}

// streamListen is a long running goroutine that pulls incoming streams from the
// transport and hands them off for processing.
func (m *Memberlist) streamListen() {
	for {
		select {
		case conn := <-m.transport.StreamCh():
			go m.handleConn(conn)

		case <-m.shutdownCh:
			return
		}
	}
}

// handleConn handles a single incoming stream connection from the transport.
func (m *Memberlist) handleConn(conn net.Conn) { m.logger.Printf("[DEBUG] memberlist: Stream connection %s", LogConn(conn)) defer conn.Close() metrics.IncrCounter([]string{"memberlist", "tcp", "accept"}, 1) conn.SetDeadline(time.Now().Add(m.config.TCPTimeout)) msgType, bufConn, dec, err := m.readStream(conn) if err != nil { if err != io.EOF { m.logger.Printf("[ERR] memberlist: failed to receive: %s %s", err, LogConn(conn)) } return } switch msgType { case userMsg: if err := m.readUserMsg(bufConn, dec); err != nil { m.logger.Printf("[ERR] memberlist: Failed to receive user message: %s %s", err, LogConn(conn)) } case pushPullMsg: join, remoteNodes, userState, err := m.readRemoteState(bufConn, dec) if err != nil { m.logger.Printf("[ERR] memberlist: Failed to read remote state: %s %s", err, LogConn(conn)) return } if err := m.sendLocalState(conn, join); err != nil { m.logger.Printf("[ERR] memberlist: Failed to push local state: %s %s", err, LogConn(conn)) return } if err := m.mergeRemoteState(join, remoteNodes, userState); err != nil { m.logger.Printf("[ERR] memberlist: Failed push/pull merge: %s %s", err, LogConn(conn)) return } case pingMsg: var p ping if err := dec.Decode(&p); err != nil { m.logger.Printf("[ERR] memberlist: Failed to decode ping: %s %s", err, LogConn(conn)) return } if p.Node != "" && p.Node != m.config.Name { m.logger.Printf("[WARN] memberlist: Got ping for unexpected node %s %s", p.Node, LogConn(conn)) return } ack := ackResp{p.SeqNo, nil} out, err := encode(ackRespMsg, &ack) if err != nil { m.logger.Printf("[ERR] memberlist: Failed to encode ack: %s", err) return } err = m.rawSendMsgStream(conn, out.Bytes()) if err != nil { m.logger.Printf("[ERR] memberlist: Failed to send ack: %s %s", err, LogConn(conn)) return } default: m.logger.Printf("[ERR] memberlist: Received invalid msgType (%d) %s", msgType, LogConn(conn)) } } // packetListen is a long running goroutine that pulls packets out of the // transport and hands them off for processing. 
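// Each Packet delivered by the transport carries the raw buffer, the sender's
// address, and a receive timestamp; ingestPacket uses these for decryption,
// CRC verification, and dispatch into handleCommand.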
func (m *Memberlist) packetListen() { for { select { case packet := <-m.transport.PacketCh(): m.ingestPacket(packet.Buf, packet.From, packet.Timestamp) case <-m.shutdownCh: return } } } func (m *Memberlist) ingestPacket(buf []byte, from net.Addr, timestamp time.Time) { // Check if encryption is enabled if m.config.EncryptionEnabled() { // Decrypt the payload plain, err := decryptPayload(m.config.Keyring.GetKeys(), buf, nil) if err != nil { m.logger.Printf("[ERR] memberlist: Decrypt packet failed: %v %s", err, LogAddress(from)) return } // Continue processing the plaintext buffer buf = plain } // See if there's a checksum included to verify the contents of the message if len(buf) >= 5 && messageType(buf[0]) == hasCrcMsg { crc := crc32.ChecksumIEEE(buf[5:]) expected := binary.BigEndian.Uint32(buf[1:5]) if crc != expected { m.logger.Printf("[WARN] memberlist: Got invalid checksum for UDP packet: %x, %x", crc, expected) return } m.handleCommand(buf[5:], from, timestamp) } else { m.handleCommand(buf, from, timestamp) } } func (m *Memberlist) handleCommand(buf []byte, from net.Addr, timestamp time.Time) { // Decode the message type msgType := messageType(buf[0]) buf = buf[1:] // Switch on the msgType switch msgType { case compoundMsg: m.handleCompound(buf, from, timestamp) case compressMsg: m.handleCompressed(buf, from, timestamp) case pingMsg: m.handlePing(buf, from) case indirectPingMsg: m.handleIndirectPing(buf, from) case ackRespMsg: m.handleAck(buf, from, timestamp) case nackRespMsg: m.handleNack(buf, from) case suspectMsg: fallthrough case aliveMsg: fallthrough case deadMsg: fallthrough case userMsg: select { case m.handoff <- msgHandoff{msgType, buf, from}: default: m.logger.Printf("[WARN] memberlist: handler queue full, dropping message (%d) %s", msgType, LogAddress(from)) } default: m.logger.Printf("[ERR] memberlist: msg type (%d) not supported %s", msgType, LogAddress(from)) } } // packetHandler is a long running goroutine that processes messages received // over the packet interface, but is decoupled from the listener to avoid // blocking the listener which may cause ping/ack messages to be delayed. 
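// handleCommand enqueues suspect/alive/dead/user messages onto the buffered
// handoff channel (sized by Config.HandoffQueueDepth); when that queue is
// full the message is dropped with a warning instead of blocking the
// listener.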
func (m *Memberlist) packetHandler() { for { select { case msg := <-m.handoff: msgType := msg.msgType buf := msg.buf from := msg.from switch msgType { case suspectMsg: m.handleSuspect(buf, from) case aliveMsg: m.handleAlive(buf, from) case deadMsg: m.handleDead(buf, from) case userMsg: m.handleUser(buf, from) default: m.logger.Printf("[ERR] memberlist: Message type (%d) not supported %s (packet handler)", msgType, LogAddress(from)) } case <-m.shutdownCh: return } } } func (m *Memberlist) handleCompound(buf []byte, from net.Addr, timestamp time.Time) { // Decode the parts trunc, parts, err := decodeCompoundMessage(buf) if err != nil { m.logger.Printf("[ERR] memberlist: Failed to decode compound request: %s %s", err, LogAddress(from)) return } // Log any truncation if trunc > 0 { m.logger.Printf("[WARN] memberlist: Compound request had %d truncated messages %s", trunc, LogAddress(from)) } // Handle each message for _, part := range parts { m.handleCommand(part, from, timestamp) } } func (m *Memberlist) handlePing(buf []byte, from net.Addr) { var p ping if err := decode(buf, &p); err != nil { m.logger.Printf("[ERR] memberlist: Failed to decode ping request: %s %s", err, LogAddress(from)) return } // If node is provided, verify that it is for us if p.Node != "" && p.Node != m.config.Name { m.logger.Printf("[WARN] memberlist: Got ping for unexpected node '%s' %s", p.Node, LogAddress(from)) return } var ack ackResp ack.SeqNo = p.SeqNo if m.config.Ping != nil { ack.Payload = m.config.Ping.AckPayload() } if err := m.encodeAndSendMsg(from.String(), ackRespMsg, &ack); err != nil { m.logger.Printf("[ERR] memberlist: Failed to send ack: %s %s", err, LogAddress(from)) } } func (m *Memberlist) handleIndirectPing(buf []byte, from net.Addr) { var ind indirectPingReq if err := decode(buf, &ind); err != nil { m.logger.Printf("[ERR] memberlist: Failed to decode indirect ping request: %s %s", err, LogAddress(from)) return } // For proto versions < 2, there is no port provided. Mask old // behavior by using the configured port. if m.ProtocolVersion() < 2 || ind.Port == 0 { ind.Port = uint16(m.config.BindPort) } // Send a ping to the correct host. localSeqNo := m.nextSeqNo() ping := ping{SeqNo: localSeqNo, Node: ind.Node} // Setup a response handler to relay the ack cancelCh := make(chan struct{}) respHandler := func(payload []byte, timestamp time.Time) { // Try to prevent the nack if we've caught it in time. close(cancelCh) // Forward the ack back to the requestor. ack := ackResp{ind.SeqNo, nil} if err := m.encodeAndSendMsg(from.String(), ackRespMsg, &ack); err != nil { m.logger.Printf("[ERR] memberlist: Failed to forward ack: %s %s", err, LogAddress(from)) } } m.setAckHandler(localSeqNo, respHandler, m.config.ProbeTimeout) // Send the ping. addr := joinHostPort(net.IP(ind.Target).String(), ind.Port) if err := m.encodeAndSendMsg(addr, pingMsg, &ping); err != nil { m.logger.Printf("[ERR] memberlist: Failed to send ping: %s %s", err, LogAddress(from)) } // Setup a timer to fire off a nack if no ack is seen in time. 
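// The ack handler above closes cancelCh when the target answers in time;
// otherwise the goroutine below fires a nackResp back to the requestor after
// ProbeTimeout so it can account for the failed indirect probe.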
if ind.Nack { go func() { select { case <-cancelCh: return case <-time.After(m.config.ProbeTimeout): nack := nackResp{ind.SeqNo} if err := m.encodeAndSendMsg(from.String(), nackRespMsg, &nack); err != nil { m.logger.Printf("[ERR] memberlist: Failed to send nack: %s %s", err, LogAddress(from)) } } }() } } func (m *Memberlist) handleAck(buf []byte, from net.Addr, timestamp time.Time) { var ack ackResp if err := decode(buf, &ack); err != nil { m.logger.Printf("[ERR] memberlist: Failed to decode ack response: %s %s", err, LogAddress(from)) return } m.invokeAckHandler(ack, timestamp) } func (m *Memberlist) handleNack(buf []byte, from net.Addr) { var nack nackResp if err := decode(buf, &nack); err != nil { m.logger.Printf("[ERR] memberlist: Failed to decode nack response: %s %s", err, LogAddress(from)) return } m.invokeNackHandler(nack) } func (m *Memberlist) handleSuspect(buf []byte, from net.Addr) { var sus suspect if err := decode(buf, &sus); err != nil { m.logger.Printf("[ERR] memberlist: Failed to decode suspect message: %s %s", err, LogAddress(from)) return } m.suspectNode(&sus) } func (m *Memberlist) handleAlive(buf []byte, from net.Addr) { var live alive if err := decode(buf, &live); err != nil { m.logger.Printf("[ERR] memberlist: Failed to decode alive message: %s %s", err, LogAddress(from)) return } // For proto versions < 2, there is no port provided. Mask old // behavior by using the configured port if m.ProtocolVersion() < 2 || live.Port == 0 { live.Port = uint16(m.config.BindPort) } m.aliveNode(&live, nil, false) } func (m *Memberlist) handleDead(buf []byte, from net.Addr) { var d dead if err := decode(buf, &d); err != nil { m.logger.Printf("[ERR] memberlist: Failed to decode dead message: %s %s", err, LogAddress(from)) return } m.deadNode(&d) } // handleUser is used to notify channels of incoming user data func (m *Memberlist) handleUser(buf []byte, from net.Addr) { d := m.config.Delegate if d != nil { d.NotifyMsg(buf) } } // handleCompressed is used to unpack a compressed message func (m *Memberlist) handleCompressed(buf []byte, from net.Addr, timestamp time.Time) { // Try to decode the payload payload, err := decompressPayload(buf) if err != nil { m.logger.Printf("[ERR] memberlist: Failed to decompress payload: %v %s", err, LogAddress(from)) return } // Recursively handle the payload m.handleCommand(payload, from, timestamp) } // encodeAndSendMsg is used to combine the encoding and sending steps func (m *Memberlist) encodeAndSendMsg(addr string, msgType messageType, msg interface{}) error { out, err := encode(msgType, msg) if err != nil { return err } if err := m.sendMsg(addr, out.Bytes()); err != nil { return err } return nil } // sendMsg is used to send a message via packet to another host. It will // opportunistically create a compoundMsg and piggy back other broadcasts. func (m *Memberlist) sendMsg(addr string, msg []byte) error { // Check if we can piggy back any messages bytesAvail := m.config.UDPBufferSize - len(msg) - compoundHeaderOverhead if m.config.EncryptionEnabled() { bytesAvail -= encryptOverhead(m.encryptionVersion()) } extra := m.getBroadcasts(compoundOverhead, bytesAvail) // Fast path if nothing to piggypack if len(extra) == 0 { return m.rawSendMsgPacket(addr, nil, msg) } // Join all the messages msgs := make([][]byte, 0, 1+len(extra)) msgs = append(msgs, msg) msgs = append(msgs, extra...) 
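// Editorial note (illustrative, not part of the original source): at this point msgs
// holds the original message plus any queued broadcasts that fit within the remaining
// UDP budget computed above, roughly
//
//	UDPBufferSize - len(msg) - compoundHeaderOverhead - encryption overhead (if enabled)
//
// less a per-message compoundOverhead. Packing them into a single compoundMsg below
// lets gossip piggyback on packets that were being sent anyway, so broadcasts spread
// without requiring extra datagrams.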
// Create a compound message compound := makeCompoundMessage(msgs) // Send the message return m.rawSendMsgPacket(addr, nil, compound.Bytes()) } // rawSendMsgPacket is used to send message via packet to another host without // modification, other than compression or encryption if enabled. func (m *Memberlist) rawSendMsgPacket(addr string, node *Node, msg []byte) error { // Check if we have compression enabled if m.config.EnableCompression { buf, err := compressPayload(msg) if err != nil { m.logger.Printf("[WARN] memberlist: Failed to compress payload: %v", err) } else { // Only use compression if it reduced the size if buf.Len() < len(msg) { msg = buf.Bytes() } } } // Try to look up the destination node if node == nil { toAddr, _, err := net.SplitHostPort(addr) if err != nil { m.logger.Printf("[ERR] memberlist: Failed to parse address %q: %v", addr, err) return err } m.nodeLock.RLock() nodeState, ok := m.nodeMap[toAddr] m.nodeLock.RUnlock() if ok { node = &nodeState.Node } } // Add a CRC to the end of the payload if the recipient understands // ProtocolVersion >= 5 if node != nil && node.PMax >= 5 { crc := crc32.ChecksumIEEE(msg) header := make([]byte, 5, 5+len(msg)) header[0] = byte(hasCrcMsg) binary.BigEndian.PutUint32(header[1:], crc) msg = append(header, msg...) } // Check if we have encryption enabled if m.config.EncryptionEnabled() { // Encrypt the payload var buf bytes.Buffer primaryKey := m.config.Keyring.GetPrimaryKey() err := encryptPayload(m.encryptionVersion(), primaryKey, msg, nil, &buf) if err != nil { m.logger.Printf("[ERR] memberlist: Encryption of message failed: %v", err) return err } msg = buf.Bytes() } metrics.IncrCounter([]string{"memberlist", "udp", "sent"}, float32(len(msg))) _, err := m.transport.WriteTo(msg, addr) return err } // rawSendMsgStream is used to stream a message to another host without // modification, other than applying compression and encryption if enabled. func (m *Memberlist) rawSendMsgStream(conn net.Conn, sendBuf []byte) error { // Check if compresion is enabled if m.config.EnableCompression { compBuf, err := compressPayload(sendBuf) if err != nil { m.logger.Printf("[ERROR] memberlist: Failed to compress payload: %v", err) } else { sendBuf = compBuf.Bytes() } } // Check if encryption is enabled if m.config.EncryptionEnabled() { crypt, err := m.encryptLocalState(sendBuf) if err != nil { m.logger.Printf("[ERROR] memberlist: Failed to encrypt local state: %v", err) return err } sendBuf = crypt } // Write out the entire send buffer metrics.IncrCounter([]string{"memberlist", "tcp", "sent"}, float32(len(sendBuf))) if n, err := conn.Write(sendBuf); err != nil { return err } else if n != len(sendBuf) { return fmt.Errorf("only %d of %d bytes written", n, len(sendBuf)) } return nil } // sendUserMsg is used to stream a user message to another host. func (m *Memberlist) sendUserMsg(addr string, sendBuf []byte) error { conn, err := m.transport.DialTimeout(addr, m.config.TCPTimeout) if err != nil { return err } defer conn.Close() bufConn := bytes.NewBuffer(nil) if err := bufConn.WriteByte(byte(userMsg)); err != nil { return err } header := userMsgHeader{UserMsgLen: len(sendBuf)} hd := codec.MsgpackHandle{} enc := codec.NewEncoder(bufConn, &hd) if err := enc.Encode(&header); err != nil { return err } if _, err := bufConn.Write(sendBuf); err != nil { return err } return m.rawSendMsgStream(conn, bufConn.Bytes()) } // sendAndReceiveState is used to initiate a push/pull over a stream with a // remote host. 
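// Editorial sketch (illustrative, not part of the original source): the stream layout
// produced by sendLocalState below and decoded by readStream and readRemoteState.
// Ignoring the optional compressMsg/encryptMsg wrapping added by rawSendMsgStream, a
// push/pull payload is
//
//	[pushPullMsg]                     1 byte
//	[pushPullHeader]                  msgpack: {Nodes, UserStateLen, Join}
//	[pushNodeState] x header.Nodes    msgpack, one entry per known node
//	[user state]                      header.UserStateLen raw delegate bytes
//
// Both directions use the same shape, which is why sendAndReceiveState can read the
// reply with the same readRemoteState helper.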
func (m *Memberlist) sendAndReceiveState(addr string, join bool) ([]pushNodeState, []byte, error) { // Attempt to connect conn, err := m.transport.DialTimeout(addr, m.config.TCPTimeout) if err != nil { return nil, nil, err } defer conn.Close() m.logger.Printf("[DEBUG] memberlist: Initiating push/pull sync with: %s", conn.RemoteAddr()) metrics.IncrCounter([]string{"memberlist", "tcp", "connect"}, 1) // Send our state if err := m.sendLocalState(conn, join); err != nil { return nil, nil, err } conn.SetDeadline(time.Now().Add(m.config.TCPTimeout)) msgType, bufConn, dec, err := m.readStream(conn) if err != nil { return nil, nil, err } // Quit if not push/pull if msgType != pushPullMsg { err := fmt.Errorf("received invalid msgType (%d), expected pushPullMsg (%d) %s", msgType, pushPullMsg, LogConn(conn)) return nil, nil, err } // Read remote state _, remoteNodes, userState, err := m.readRemoteState(bufConn, dec) return remoteNodes, userState, err } // sendLocalState is invoked to send our local state over a stream connection. func (m *Memberlist) sendLocalState(conn net.Conn, join bool) error { // Setup a deadline conn.SetDeadline(time.Now().Add(m.config.TCPTimeout)) // Prepare the local node state m.nodeLock.RLock() localNodes := make([]pushNodeState, len(m.nodes)) for idx, n := range m.nodes { localNodes[idx].Name = n.Name localNodes[idx].Addr = n.Addr localNodes[idx].Port = n.Port localNodes[idx].Incarnation = n.Incarnation localNodes[idx].State = n.State localNodes[idx].Meta = n.Meta localNodes[idx].Vsn = []uint8{ n.PMin, n.PMax, n.PCur, n.DMin, n.DMax, n.DCur, } } m.nodeLock.RUnlock() // Get the delegate state var userData []byte if m.config.Delegate != nil { userData = m.config.Delegate.LocalState(join) } // Create a bytes buffer writer bufConn := bytes.NewBuffer(nil) // Send our node state header := pushPullHeader{Nodes: len(localNodes), UserStateLen: len(userData), Join: join} hd := codec.MsgpackHandle{} enc := codec.NewEncoder(bufConn, &hd) // Begin state push if _, err := bufConn.Write([]byte{byte(pushPullMsg)}); err != nil { return err } if err := enc.Encode(&header); err != nil { return err } for i := 0; i < header.Nodes; i++ { if err := enc.Encode(&localNodes[i]); err != nil { return err } } // Write the user state as well if userData != nil { if _, err := bufConn.Write(userData); err != nil { return err } } // Get the send buffer return m.rawSendMsgStream(conn, bufConn.Bytes()) } // encryptLocalState is used to help encrypt local state before sending func (m *Memberlist) encryptLocalState(sendBuf []byte) ([]byte, error) { var buf bytes.Buffer // Write the encryptMsg byte buf.WriteByte(byte(encryptMsg)) // Write the size of the message sizeBuf := make([]byte, 4) encVsn := m.encryptionVersion() encLen := encryptedLength(encVsn, len(sendBuf)) binary.BigEndian.PutUint32(sizeBuf, uint32(encLen)) buf.Write(sizeBuf) // Write the encrypted cipher text to the buffer key := m.config.Keyring.GetPrimaryKey() err := encryptPayload(encVsn, key, sendBuf, buf.Bytes()[:5], &buf) if err != nil { return nil, err } return buf.Bytes(), nil } // decryptRemoteState is used to help decrypt the remote state func (m *Memberlist) decryptRemoteState(bufConn io.Reader) ([]byte, error) { // Read in enough to determine message length cipherText := bytes.NewBuffer(nil) cipherText.WriteByte(byte(encryptMsg)) _, err := io.CopyN(cipherText, bufConn, 4) if err != nil { return nil, err } // Ensure we aren't asked to download too much. 
This is to guard against // an attack vector where a huge amount of state is sent moreBytes := binary.BigEndian.Uint32(cipherText.Bytes()[1:5]) if moreBytes > maxPushStateBytes { return nil, fmt.Errorf("Remote node state is larger than limit (%d)", moreBytes) } // Read in the rest of the payload _, err = io.CopyN(cipherText, bufConn, int64(moreBytes)) if err != nil { return nil, err } // Decrypt the cipherText dataBytes := cipherText.Bytes()[:5] cipherBytes := cipherText.Bytes()[5:] // Decrypt the payload keys := m.config.Keyring.GetKeys() return decryptPayload(keys, cipherBytes, dataBytes) } // readStream is used to read from a stream connection, decrypting and // decompressing the stream if necessary. func (m *Memberlist) readStream(conn net.Conn) (messageType, io.Reader, *codec.Decoder, error) { // Created a buffered reader var bufConn io.Reader = bufio.NewReader(conn) // Read the message type buf := [1]byte{0} if _, err := bufConn.Read(buf[:]); err != nil { return 0, nil, nil, err } msgType := messageType(buf[0]) // Check if the message is encrypted if msgType == encryptMsg { if !m.config.EncryptionEnabled() { return 0, nil, nil, fmt.Errorf("Remote state is encrypted and encryption is not configured") } plain, err := m.decryptRemoteState(bufConn) if err != nil { return 0, nil, nil, err } // Reset message type and bufConn msgType = messageType(plain[0]) bufConn = bytes.NewReader(plain[1:]) } else if m.config.EncryptionEnabled() { return 0, nil, nil, fmt.Errorf("Encryption is configured but remote state is not encrypted") } // Get the msgPack decoders hd := codec.MsgpackHandle{} dec := codec.NewDecoder(bufConn, &hd) // Check if we have a compressed message if msgType == compressMsg { var c compress if err := dec.Decode(&c); err != nil { return 0, nil, nil, err } decomp, err := decompressBuffer(&c) if err != nil { return 0, nil, nil, err } // Reset the message type msgType = messageType(decomp[0]) // Create a new bufConn bufConn = bytes.NewReader(decomp[1:]) // Create a new decoder dec = codec.NewDecoder(bufConn, &hd) } return msgType, bufConn, dec, nil } // readRemoteState is used to read the remote state from a connection func (m *Memberlist) readRemoteState(bufConn io.Reader, dec *codec.Decoder) (bool, []pushNodeState, []byte, error) { // Read the push/pull header var header pushPullHeader if err := dec.Decode(&header); err != nil { return false, nil, nil, err } // Allocate space for the transfer remoteNodes := make([]pushNodeState, header.Nodes) // Try to decode all the states for i := 0; i < header.Nodes; i++ { if err := dec.Decode(&remoteNodes[i]); err != nil { return false, nil, nil, err } } // Read the remote user state into a buffer var userBuf []byte if header.UserStateLen > 0 { userBuf = make([]byte, header.UserStateLen) bytes, err := io.ReadAtLeast(bufConn, userBuf, header.UserStateLen) if err == nil && bytes != header.UserStateLen { err = fmt.Errorf( "Failed to read full user state (%d / %d)", bytes, header.UserStateLen) } if err != nil { return false, nil, nil, err } } // For proto versions < 2, there is no port provided. 
Mask old // behavior by using the configured port for idx := range remoteNodes { if m.ProtocolVersion() < 2 || remoteNodes[idx].Port == 0 { remoteNodes[idx].Port = uint16(m.config.BindPort) } } return header.Join, remoteNodes, userBuf, nil } // mergeRemoteState is used to merge the remote state with our local state func (m *Memberlist) mergeRemoteState(join bool, remoteNodes []pushNodeState, userBuf []byte) error { if err := m.verifyProtocol(remoteNodes); err != nil { return err } // Invoke the merge delegate if any if join && m.config.Merge != nil { nodes := make([]*Node, len(remoteNodes)) for idx, n := range remoteNodes { nodes[idx] = &Node{ Name: n.Name, Addr: n.Addr, Port: n.Port, Meta: n.Meta, PMin: n.Vsn[0], PMax: n.Vsn[1], PCur: n.Vsn[2], DMin: n.Vsn[3], DMax: n.Vsn[4], DCur: n.Vsn[5], } } if err := m.config.Merge.NotifyMerge(nodes); err != nil { return err } } // Merge the membership state m.mergeState(remoteNodes) // Invoke the delegate for user state if userBuf != nil && m.config.Delegate != nil { m.config.Delegate.MergeRemoteState(userBuf, join) } return nil } // readUserMsg is used to decode a userMsg from a stream. func (m *Memberlist) readUserMsg(bufConn io.Reader, dec *codec.Decoder) error { // Read the user message header var header userMsgHeader if err := dec.Decode(&header); err != nil { return err } // Read the user message into a buffer var userBuf []byte if header.UserMsgLen > 0 { userBuf = make([]byte, header.UserMsgLen) bytes, err := io.ReadAtLeast(bufConn, userBuf, header.UserMsgLen) if err == nil && bytes != header.UserMsgLen { err = fmt.Errorf( "Failed to read full user message (%d / %d)", bytes, header.UserMsgLen) } if err != nil { return err } d := m.config.Delegate if d != nil { d.NotifyMsg(userBuf) } } return nil } // sendPingAndWaitForAck makes a stream connection to the given address, sends // a ping, and waits for an ack. All of this is done as a series of blocking // operations, given the deadline. The bool return parameter is true if we // were able to round trip a ping to the other node. func (m *Memberlist) sendPingAndWaitForAck(addr string, ping ping, deadline time.Time) (bool, error) { conn, err := m.transport.DialTimeout(addr, m.config.TCPTimeout) if err != nil { // If the node is actually dead we expect this to fail, so we // shouldn't spam the logs with it. After this point, errors // with the connection are real, unexpected errors and should // get propagated up. return false, nil } defer conn.Close() conn.SetDeadline(deadline) out, err := encode(pingMsg, &ping) if err != nil { return false, err } if err = m.rawSendMsgStream(conn, out.Bytes()); err != nil { return false, err } msgType, _, dec, err := m.readStream(conn) if err != nil { return false, err } if msgType != ackRespMsg { return false, fmt.Errorf("Unexpected msgType (%d) from ping %s", msgType, LogConn(conn)) } var ack ackResp if err = dec.Decode(&ack); err != nil { return false, err } if ack.SeqNo != ping.SeqNo { return false, fmt.Errorf("Sequence number from ack (%d) doesn't match ping (%d) %s", ack.SeqNo, ping.SeqNo, LogConn(conn)) } return true, nil } memberlist-0.1.0/net_test.go000066400000000000000000000426031307374264600160500ustar00rootroot00000000000000package memberlist import ( "bytes" "encoding/binary" "fmt" "io" "log" "net" "reflect" "strings" "testing" "time" "github.com/hashicorp/go-msgpack/codec" ) // As a regression we left this test very low-level and network-ey, even after // we abstracted the transport.
We added some basic network-free transport tests // in transport_test.go to prove that we didn't hard code some network stuff // outside of NetTransport. func TestHandleCompoundPing(t *testing.T) { m := GetMemberlist(t) m.config.EnableCompression = false defer m.Shutdown() var udp *net.UDPConn for port := 60000; port < 61000; port++ { udpAddr := fmt.Sprintf("127.0.0.1:%d", port) udpLn, err := net.ListenPacket("udp", udpAddr) if err == nil { udp = udpLn.(*net.UDPConn) break } } if udp == nil { t.Fatalf("no udp listener") } // Encode a ping ping := ping{SeqNo: 42} buf, err := encode(pingMsg, ping) if err != nil { t.Fatalf("unexpected err %s", err) } // Make a compound message compound := makeCompoundMessage([][]byte{buf.Bytes(), buf.Bytes(), buf.Bytes()}) // Send compound version addr := &net.UDPAddr{IP: net.ParseIP(m.config.BindAddr), Port: m.config.BindPort} udp.WriteTo(compound.Bytes(), addr) // Wait for responses doneCh := make(chan struct{}, 1) go func() { select { case <-doneCh: case <-time.After(2 * time.Second): panic("timeout") } }() for i := 0; i < 3; i++ { in := make([]byte, 1500) n, _, err := udp.ReadFrom(in) if err != nil { t.Fatalf("unexpected err %s", err) } in = in[0:n] msgType := messageType(in[0]) if msgType != ackRespMsg { t.Fatalf("bad response %v", in) } var ack ackResp if err := decode(in[1:], &ack); err != nil { t.Fatalf("unexpected err %s", err) } if ack.SeqNo != 42 { t.Fatalf("bad sequence no") } } doneCh <- struct{}{} } func TestHandlePing(t *testing.T) { m := GetMemberlist(t) m.config.EnableCompression = false defer m.Shutdown() var udp *net.UDPConn for port := 60000; port < 61000; port++ { udpAddr := fmt.Sprintf("127.0.0.1:%d", port) udpLn, err := net.ListenPacket("udp", udpAddr) if err == nil { udp = udpLn.(*net.UDPConn) break } } if udp == nil { t.Fatalf("no udp listener") } // Encode a ping ping := ping{SeqNo: 42} buf, err := encode(pingMsg, ping) if err != nil { t.Fatalf("unexpected err %s", err) } // Send addr := &net.UDPAddr{IP: net.ParseIP(m.config.BindAddr), Port: m.config.BindPort} udp.WriteTo(buf.Bytes(), addr) // Wait for response doneCh := make(chan struct{}, 1) go func() { select { case <-doneCh: case <-time.After(2 * time.Second): panic("timeout") } }() in := make([]byte, 1500) n, _, err := udp.ReadFrom(in) if err != nil { t.Fatalf("unexpected err %s", err) } in = in[0:n] msgType := messageType(in[0]) if msgType != ackRespMsg { t.Fatalf("bad response %v", in) } var ack ackResp if err := decode(in[1:], &ack); err != nil { t.Fatalf("unexpected err %s", err) } if ack.SeqNo != 42 { t.Fatalf("bad sequence no") } doneCh <- struct{}{} } func TestHandlePing_WrongNode(t *testing.T) { m := GetMemberlist(t) m.config.EnableCompression = false defer m.Shutdown() var udp *net.UDPConn for port := 60000; port < 61000; port++ { udpAddr := fmt.Sprintf("127.0.0.1:%d", port) udpLn, err := net.ListenPacket("udp", udpAddr) if err == nil { udp = udpLn.(*net.UDPConn) break } } if udp == nil { t.Fatalf("no udp listener") } // Encode a ping, wrong node! 
ping := ping{SeqNo: 42, Node: m.config.Name + "-bad"} buf, err := encode(pingMsg, ping) if err != nil { t.Fatalf("unexpected err %s", err) } // Send addr := &net.UDPAddr{IP: net.ParseIP(m.config.BindAddr), Port: m.config.BindPort} udp.WriteTo(buf.Bytes(), addr) // Wait for response udp.SetDeadline(time.Now().Add(50 * time.Millisecond)) in := make([]byte, 1500) _, _, err = udp.ReadFrom(in) // Should get an i/o timeout if err == nil { t.Fatalf("expected err %s", err) } } func TestHandleIndirectPing(t *testing.T) { m := GetMemberlist(t) m.config.EnableCompression = false defer m.Shutdown() var udp *net.UDPConn for port := 60000; port < 61000; port++ { udpAddr := fmt.Sprintf("127.0.0.1:%d", port) udpLn, err := net.ListenPacket("udp", udpAddr) if err == nil { udp = udpLn.(*net.UDPConn) break } } if udp == nil { t.Fatalf("no udp listener") } // Encode an indirect ping ind := indirectPingReq{ SeqNo: 100, Target: net.ParseIP(m.config.BindAddr), Port: uint16(m.config.BindPort), } buf, err := encode(indirectPingMsg, &ind) if err != nil { t.Fatalf("unexpected err %s", err) } // Send addr := &net.UDPAddr{IP: net.ParseIP(m.config.BindAddr), Port: m.config.BindPort} udp.WriteTo(buf.Bytes(), addr) // Wait for response doneCh := make(chan struct{}, 1) go func() { select { case <-doneCh: case <-time.After(2 * time.Second): panic("timeout") } }() in := make([]byte, 1500) n, _, err := udp.ReadFrom(in) if err != nil { t.Fatalf("unexpected err %s", err) } in = in[0:n] msgType := messageType(in[0]) if msgType != ackRespMsg { t.Fatalf("bad response %v", in) } var ack ackResp if err := decode(in[1:], &ack); err != nil { t.Fatalf("unexpected err %s", err) } if ack.SeqNo != 100 { t.Fatalf("bad sequence no") } doneCh <- struct{}{} } func TestTCPPing(t *testing.T) { var tcp *net.TCPListener var tcpAddr *net.TCPAddr for port := 60000; port < 61000; port++ { tcpAddr = &net.TCPAddr{IP: net.ParseIP("127.0.0.1"), Port: port} tcpLn, err := net.ListenTCP("tcp", tcpAddr) if err == nil { tcp = tcpLn break } } if tcp == nil { t.Fatalf("no tcp listener") } // Note that tcp gets closed in the last test, so we avoid a deferred // Close() call here. m := GetMemberlist(t) defer m.Shutdown() pingTimeout := m.config.ProbeInterval pingTimeMax := m.config.ProbeInterval + 10*time.Millisecond // Do a normal round trip. 
pingOut := ping{SeqNo: 23, Node: "mongo"} go func() { tcp.SetDeadline(time.Now().Add(pingTimeMax)) conn, err := tcp.AcceptTCP() if err != nil { t.Fatalf("failed to connect: %s", err) } defer conn.Close() msgType, _, dec, err := m.readStream(conn) if err != nil { t.Fatalf("failed to read ping: %s", err) } if msgType != pingMsg { t.Fatalf("expecting ping, got message type (%d)", msgType) } var pingIn ping if err := dec.Decode(&pingIn); err != nil { t.Fatalf("failed to decode ping: %s", err) } if pingIn.SeqNo != pingOut.SeqNo { t.Fatalf("sequence number isn't correct (%d) vs (%d)", pingIn.SeqNo, pingOut.SeqNo) } if pingIn.Node != pingOut.Node { t.Fatalf("node name isn't correct (%s) vs (%s)", pingIn.Node, pingOut.Node) } ack := ackResp{pingIn.SeqNo, nil} out, err := encode(ackRespMsg, &ack) if err != nil { t.Fatalf("failed to encode ack: %s", err) } err = m.rawSendMsgStream(conn, out.Bytes()) if err != nil { t.Fatalf("failed to send ack: %s", err) } }() deadline := time.Now().Add(pingTimeout) didContact, err := m.sendPingAndWaitForAck(tcpAddr.String(), pingOut, deadline) if err != nil { t.Fatalf("error trying to ping: %s", err) } if !didContact { t.Fatalf("expected successful ping") } // Make sure a mis-matched sequence number is caught. go func() { tcp.SetDeadline(time.Now().Add(pingTimeMax)) conn, err := tcp.AcceptTCP() if err != nil { t.Fatalf("failed to connect: %s", err) } defer conn.Close() _, _, dec, err := m.readStream(conn) if err != nil { t.Fatalf("failed to read ping: %s", err) } var pingIn ping if err := dec.Decode(&pingIn); err != nil { t.Fatalf("failed to decode ping: %s", err) } ack := ackResp{pingIn.SeqNo + 1, nil} out, err := encode(ackRespMsg, &ack) if err != nil { t.Fatalf("failed to encode ack: %s", err) } err = m.rawSendMsgStream(conn, out.Bytes()) if err != nil { t.Fatalf("failed to send ack: %s", err) } }() deadline = time.Now().Add(pingTimeout) didContact, err = m.sendPingAndWaitForAck(tcpAddr.String(), pingOut, deadline) if err == nil || !strings.Contains(err.Error(), "Sequence number") { t.Fatalf("expected an error from mis-matched sequence number") } if didContact { t.Fatalf("expected failed ping") } // Make sure an unexpected message type is handled gracefully. go func() { tcp.SetDeadline(time.Now().Add(pingTimeMax)) conn, err := tcp.AcceptTCP() if err != nil { t.Fatalf("failed to connect: %s", err) } defer conn.Close() _, _, _, err = m.readStream(conn) if err != nil { t.Fatalf("failed to read ping: %s", err) } bogus := indirectPingReq{} out, err := encode(indirectPingMsg, &bogus) if err != nil { t.Fatalf("failed to encode bogus msg: %s", err) } err = m.rawSendMsgStream(conn, out.Bytes()) if err != nil { t.Fatalf("failed to send bogus msg: %s", err) } }() deadline = time.Now().Add(pingTimeout) didContact, err = m.sendPingAndWaitForAck(tcpAddr.String(), pingOut, deadline) if err == nil || !strings.Contains(err.Error(), "Unexpected msgType") { t.Fatalf("expected an error from bogus message") } if didContact { t.Fatalf("expected failed ping") } // Make sure failed I/O respects the deadline. In this case we try the // common case of the receiving node being totally down. 
tcp.Close() deadline = time.Now().Add(pingTimeout) startPing := time.Now() didContact, err = m.sendPingAndWaitForAck(tcpAddr.String(), pingOut, deadline) pingTime := time.Now().Sub(startPing) if err != nil { t.Fatalf("expected no error during ping on closed socket, got: %s", err) } if didContact { t.Fatalf("expected failed ping") } if pingTime > pingTimeMax { t.Fatalf("took too long to fail ping, %9.6f", pingTime.Seconds()) } } func TestTCPPushPull(t *testing.T) { m := GetMemberlist(t) defer m.Shutdown() m.nodes = append(m.nodes, &nodeState{ Node: Node{ Name: "Test 0", Addr: net.ParseIP(m.config.BindAddr), Port: uint16(m.config.BindPort), }, Incarnation: 0, State: stateSuspect, StateChange: time.Now().Add(-1 * time.Second), }) addr := fmt.Sprintf("%s:%d", m.config.BindAddr, m.config.BindPort) conn, err := net.Dial("tcp", addr) if err != nil { t.Fatalf("unexpected err %s", err) } defer conn.Close() localNodes := make([]pushNodeState, 3) localNodes[0].Name = "Test 0" localNodes[0].Addr = net.ParseIP(m.config.BindAddr) localNodes[0].Port = uint16(m.config.BindPort) localNodes[0].Incarnation = 1 localNodes[0].State = stateAlive localNodes[1].Name = "Test 1" localNodes[1].Addr = net.ParseIP(m.config.BindAddr) localNodes[1].Port = uint16(m.config.BindPort) localNodes[1].Incarnation = 1 localNodes[1].State = stateAlive localNodes[2].Name = "Test 2" localNodes[2].Addr = net.ParseIP(m.config.BindAddr) localNodes[2].Port = uint16(m.config.BindPort) localNodes[2].Incarnation = 1 localNodes[2].State = stateAlive // Send our node state header := pushPullHeader{Nodes: 3} hd := codec.MsgpackHandle{} enc := codec.NewEncoder(conn, &hd) // Send the push/pull indicator conn.Write([]byte{byte(pushPullMsg)}) if err := enc.Encode(&header); err != nil { t.Fatalf("unexpected err %s", err) } for i := 0; i < header.Nodes; i++ { if err := enc.Encode(&localNodes[i]); err != nil { t.Fatalf("unexpected err %s", err) } } // Read the message type var msgType messageType if err := binary.Read(conn, binary.BigEndian, &msgType); err != nil { t.Fatalf("unexpected err %s", err) } var bufConn io.Reader = conn msghd := codec.MsgpackHandle{} dec := codec.NewDecoder(bufConn, &msghd) // Check if we have a compressed message if msgType == compressMsg { var c compress if err := dec.Decode(&c); err != nil { t.Fatalf("unexpected err %s", err) } decomp, err := decompressBuffer(&c) if err != nil { t.Fatalf("unexpected err %s", err) } // Reset the message type msgType = messageType(decomp[0]) // Create a new bufConn bufConn = bytes.NewReader(decomp[1:]) // Create a new decoder dec = codec.NewDecoder(bufConn, &hd) } // Quit if not push/pull if msgType != pushPullMsg { t.Fatalf("bad message type") } if err := dec.Decode(&header); err != nil { t.Fatalf("unexpected err %s", err) } // Allocate space for the transfer remoteNodes := make([]pushNodeState, header.Nodes) // Try to decode all the states for i := 0; i < header.Nodes; i++ { if err := dec.Decode(&remoteNodes[i]); err != nil { t.Fatalf("unexpected err %s", err) } } if len(remoteNodes) != 1 { t.Fatalf("bad response") } n := &remoteNodes[0] if n.Name != "Test 0" { t.Fatalf("bad name") } if bytes.Compare(n.Addr, net.ParseIP(m.config.BindAddr)) != 0 { t.Fatal("bad addr") } if n.Incarnation != 0 { t.Fatal("bad incarnation") } if n.State != stateSuspect { t.Fatal("bad state") } } func TestSendMsg_Piggyback(t *testing.T) { m := GetMemberlist(t) defer m.Shutdown() // Add a message to be broadcast a := alive{ Incarnation: 10, Node: "rand", Addr: []byte{127, 0, 0, 255}, Meta: nil, } 
m.encodeAndBroadcast("rand", aliveMsg, &a) var udp *net.UDPConn for port := 60000; port < 61000; port++ { udpAddr := fmt.Sprintf("127.0.0.1:%d", port) udpLn, err := net.ListenPacket("udp", udpAddr) if err == nil { udp = udpLn.(*net.UDPConn) break } } // Encode a ping ping := ping{SeqNo: 42} buf, err := encode(pingMsg, ping) if err != nil { t.Fatalf("unexpected err %s", err) } // Send addr := &net.UDPAddr{IP: net.ParseIP(m.config.BindAddr), Port: m.config.BindPort} udp.WriteTo(buf.Bytes(), addr) // Wait for response doneCh := make(chan struct{}, 1) go func() { select { case <-doneCh: case <-time.After(2 * time.Second): panic("timeout") } }() in := make([]byte, 1500) n, _, err := udp.ReadFrom(in) if err != nil { t.Fatalf("unexpected err %s", err) } in = in[0:n] msgType := messageType(in[0]) if msgType != compoundMsg { t.Fatalf("bad response %v", in) } // get the parts trunc, parts, err := decodeCompoundMessage(in[1:]) if trunc != 0 { t.Fatalf("unexpected truncation") } if len(parts) != 2 { t.Fatalf("unexpected parts %v", parts) } if err != nil { t.Fatalf("unexpected err %s", err) } var ack ackResp if err := decode(parts[0][1:], &ack); err != nil { t.Fatalf("unexpected err %s", err) } if ack.SeqNo != 42 { t.Fatalf("bad sequence no") } var aliveout alive if err := decode(parts[1][1:], &aliveout); err != nil { t.Fatalf("unexpected err %s", err) } if aliveout.Node != "rand" || aliveout.Incarnation != 10 { t.Fatalf("bad mesg") } doneCh <- struct{}{} } func TestEncryptDecryptState(t *testing.T) { state := []byte("this is our internal state...") config := &Config{ SecretKey: []byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, ProtocolVersion: ProtocolVersionMax, } m, err := Create(config) if err != nil { t.Fatalf("err: %s", err) } defer m.Shutdown() crypt, err := m.encryptLocalState(state) if err != nil { t.Fatalf("err: %v", err) } // Create reader, seek past the type byte buf := bytes.NewReader(crypt) buf.Seek(1, 0) plain, err := m.decryptRemoteState(buf) if err != nil { t.Fatalf("err: %v", err) } if !reflect.DeepEqual(state, plain) { t.Fatalf("Decrypt failed: %v", plain) } } func TestRawSendUdp_CRC(t *testing.T) { m := GetMemberlist(t) m.config.EnableCompression = false defer m.Shutdown() var udp *net.UDPConn for port := 60000; port < 61000; port++ { udpAddr := fmt.Sprintf("127.0.0.1:%d", port) udpLn, err := net.ListenPacket("udp", udpAddr) if err == nil { udp = udpLn.(*net.UDPConn) break } } if udp == nil { t.Fatalf("no udp listener") } // Pass a nil node with no nodes registered, should result in no checksum payload := []byte{3, 3, 3, 3} m.rawSendMsgPacket(udp.LocalAddr().String(), nil, payload) in := make([]byte, 1500) n, _, err := udp.ReadFrom(in) if err != nil { t.Fatalf("unexpected err %s", err) } in = in[0:n] if len(in) != 4 { t.Fatalf("bad: %v", in) } // Pass a non-nil node with PMax >= 5, should result in a checksum m.rawSendMsgPacket(udp.LocalAddr().String(), &Node{PMax: 5}, payload) in = make([]byte, 1500) n, _, err = udp.ReadFrom(in) if err != nil { t.Fatalf("unexpected err %s", err) } in = in[0:n] if len(in) != 9 { t.Fatalf("bad: %v", in) } // Register a node with PMax >= 5 to be looked up, should result in a checksum m.nodeMap["127.0.0.1"] = &nodeState{ Node: Node{PMax: 5}, } m.rawSendMsgPacket(udp.LocalAddr().String(), nil, payload) in = make([]byte, 1500) n, _, err = udp.ReadFrom(in) if err != nil { t.Fatalf("unexpected err %s", err) } in = in[0:n] if len(in) != 9 { t.Fatalf("bad: %v", in) } } func TestIngestPacket_CRC(t *testing.T) { m := GetMemberlist(t) 
m.config.EnableCompression = false defer m.Shutdown() var udp *net.UDPConn for port := 60000; port < 61000; port++ { udpAddr := fmt.Sprintf("127.0.0.1:%d", port) udpLn, err := net.ListenPacket("udp", udpAddr) if err == nil { udp = udpLn.(*net.UDPConn) break } } if udp == nil { t.Fatalf("no udp listener") } // Get a message with a checksum payload := []byte{3, 3, 3, 3} m.rawSendMsgPacket(udp.LocalAddr().String(), &Node{PMax: 5}, payload) in := make([]byte, 1500) n, _, err := udp.ReadFrom(in) if err != nil { t.Fatalf("unexpected err %s", err) } in = in[0:n] if len(in) != 9 { t.Fatalf("bad: %v", in) } // Corrupt the checksum in[1] <<= 1 logs := &bytes.Buffer{} logger := log.New(logs, "", 0) m.logger = logger m.ingestPacket(in, udp.LocalAddr(), time.Now()) if !strings.Contains(logs.String(), "invalid checksum") { t.Fatalf("bad: %s", logs.String()) } } memberlist-0.1.0/net_transport.go000066400000000000000000000174731307374264600171340ustar00rootroot00000000000000package memberlist import ( "fmt" "log" "net" "sync" "sync/atomic" "time" "github.com/armon/go-metrics" sockaddr "github.com/hashicorp/go-sockaddr" ) const ( // udpPacketBufSize is used to buffer incoming packets during read // operations. udpPacketBufSize = 65536 // udpRecvBufSize is a large buffer size that we attempt to set UDP // sockets to in order to handle a large volume of messages. udpRecvBufSize = 2 * 1024 * 1024 ) // NetTransportConfig is used to configure a net transport. type NetTransportConfig struct { // BindAddrs is a list of addresses to bind to for both TCP and UDP // communications. BindAddrs []string // BindPort is the port to listen on, for each address above. BindPort int // Logger is a logger for operator messages. Logger *log.Logger } // NetTransport is a Transport implementation that uses connectionless UDP for // packet operations, and ad-hoc TCP connections for stream operations. type NetTransport struct { config *NetTransportConfig packetCh chan *Packet streamCh chan net.Conn logger *log.Logger wg sync.WaitGroup tcpListeners []*net.TCPListener udpListeners []*net.UDPConn shutdown int32 } // NewNetTransport returns a net transport with the given configuration. On // success all the network listeners will be created and listening. func NewNetTransport(config *NetTransportConfig) (*NetTransport, error) { // If we reject the empty list outright we can assume that there's at // least one listener of each type later during operation. if len(config.BindAddrs) == 0 { return nil, fmt.Errorf("At least one bind address is required") } // Build out the new transport. var ok bool t := NetTransport{ config: config, packetCh: make(chan *Packet), streamCh: make(chan net.Conn), logger: config.Logger, } // Clean up listeners if there's an error. defer func() { if !ok { t.Shutdown() } }() // Build all the TCP and UDP listeners. port := config.BindPort for _, addr := range config.BindAddrs { ip := net.ParseIP(addr) tcpAddr := &net.TCPAddr{IP: ip, Port: port} tcpLn, err := net.ListenTCP("tcp", tcpAddr) if err != nil { return nil, fmt.Errorf("Failed to start TCP listener on %q port %d: %v", addr, port, err) } t.tcpListeners = append(t.tcpListeners, tcpLn) // If the config port given was zero, use the first TCP listener // to pick an available port and then apply that to everything // else. 
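// Editorial usage sketch (illustrative, not part of the original source; error
// handling elided). With BindPort set to 0 the kernel picks a free port on the first
// TCP listener and that same port is then reused for UDP and for any additional bind
// addresses:
//
//	t, _ := NewNetTransport(&NetTransportConfig{
//		BindAddrs: []string{"127.0.0.1"},
//		BindPort:  0,
//		Logger:    log.New(os.Stderr, "", log.LstdFlags),
//	})
//	defer t.Shutdown()
//	port := t.GetAutoBindPort() // the port the kernel actually assigned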
if port == 0 { port = tcpLn.Addr().(*net.TCPAddr).Port } udpAddr := &net.UDPAddr{IP: ip, Port: port} udpLn, err := net.ListenUDP("udp", udpAddr) if err != nil { return nil, fmt.Errorf("Failed to start UDP listener on %q port %d: %v", addr, port, err) } if err := setUDPRecvBuf(udpLn); err != nil { return nil, fmt.Errorf("Failed to resize UDP buffer: %v", err) } t.udpListeners = append(t.udpListeners, udpLn) } // Fire them up now that we've been able to create them all. for i := 0; i < len(config.BindAddrs); i++ { t.wg.Add(2) go t.tcpListen(t.tcpListeners[i]) go t.udpListen(t.udpListeners[i]) } ok = true return &t, nil } // GetAutoBindPort returns the bind port that was automatically given by the // kernel, if a bind port of 0 was given. func (t *NetTransport) GetAutoBindPort() int { // We made sure there's at least one TCP listener, and that one's // port was applied to all the others for the dynamic bind case. return t.tcpListeners[0].Addr().(*net.TCPAddr).Port } // See Transport. func (t *NetTransport) FinalAdvertiseAddr(ip string, port int) (net.IP, int, error) { var advertiseAddr net.IP var advertisePort int if ip != "" { // If they've supplied an address, use that. advertiseAddr = net.ParseIP(ip) if advertiseAddr == nil { return nil, 0, fmt.Errorf("Failed to parse advertise address %q", ip) } // Ensure IPv4 conversion if necessary. if ip4 := advertiseAddr.To4(); ip4 != nil { advertiseAddr = ip4 } advertisePort = port } else { if t.config.BindAddrs[0] == "0.0.0.0" { // Otherwise, if we're not bound to a specific IP, let's // use a suitable private IP address. var err error ip, err = sockaddr.GetPrivateIP() if err != nil { return nil, 0, fmt.Errorf("Failed to get interface addresses: %v", err) } if ip == "" { return nil, 0, fmt.Errorf("No private IP address found, and explicit IP not provided") } advertiseAddr = net.ParseIP(ip) if advertiseAddr == nil { return nil, 0, fmt.Errorf("Failed to parse advertise address: %q", ip) } } else { // Use the IP that we're bound to, based on the first // TCP listener, which we already ensure is there. advertiseAddr = t.tcpListeners[0].Addr().(*net.TCPAddr).IP } // Use the port we are bound to. advertisePort = t.GetAutoBindPort() } return advertiseAddr, advertisePort, nil } // See Transport. func (t *NetTransport) WriteTo(b []byte, addr string) (time.Time, error) { udpAddr, err := net.ResolveUDPAddr("udp", addr) if err != nil { return time.Time{}, err } // We made sure there's at least one UDP listener, so just use the // packet sending interface on the first one. Take the time after the // write call comes back, which will underestimate the time a little, // but help account for any delays before the write occurs. _, err = t.udpListeners[0].WriteTo(b, udpAddr) return time.Now(), err } // See Transport. func (t *NetTransport) PacketCh() <-chan *Packet { return t.packetCh } // See Transport. func (t *NetTransport) DialTimeout(addr string, timeout time.Duration) (net.Conn, error) { dialer := net.Dialer{Timeout: timeout} return dialer.Dial("tcp", addr) } // See Transport. func (t *NetTransport) StreamCh() <-chan net.Conn { return t.streamCh } // See Transport. func (t *NetTransport) Shutdown() error { // This will avoid log spam about errors when we shut down. atomic.StoreInt32(&t.shutdown, 1) // Rip through all the connections and shut them down. for _, conn := range t.tcpListeners { conn.Close() } for _, conn := range t.udpListeners { conn.Close() } // Block until all the listener threads have died. 
t.wg.Wait() return nil } // tcpListen is a long running goroutine that accepts incoming TCP connections // and hands them off to the stream channel. func (t *NetTransport) tcpListen(tcpLn *net.TCPListener) { defer t.wg.Done() for { conn, err := tcpLn.AcceptTCP() if err != nil { if s := atomic.LoadInt32(&t.shutdown); s == 1 { break } t.logger.Printf("[ERR] memberlist: Error accepting TCP connection: %v", err) continue } t.streamCh <- conn } } // udpListen is a long running goroutine that accepts incoming UDP packets and // hands them off to the packet channel. func (t *NetTransport) udpListen(udpLn *net.UDPConn) { defer t.wg.Done() for { // Do a blocking read into a fresh buffer. Grab a time stamp as // close as possible to the I/O. buf := make([]byte, udpPacketBufSize) n, addr, err := udpLn.ReadFrom(buf) ts := time.Now() if err != nil { if s := atomic.LoadInt32(&t.shutdown); s == 1 { break } t.logger.Printf("[ERR] memberlist: Error reading UDP packet: %v", err) continue } // Check the length - it needs to have at least one byte to be a // proper message. if n < 1 { t.logger.Printf("[ERR] memberlist: UDP packet too short (%d bytes) %s", len(buf), LogAddress(addr)) continue } // Ingest the packet. metrics.IncrCounter([]string{"memberlist", "udp", "received"}, float32(n)) t.packetCh <- &Packet{ Buf: buf[:n], From: addr, Timestamp: ts, } } } // setUDPRecvBuf is used to resize the UDP receive window. The function // attempts to set the read buffer to `udpRecvBuf` but backs off until // the read buffer can be set. func setUDPRecvBuf(c *net.UDPConn) error { size := udpRecvBufSize var err error for size > 0 { if err = c.SetReadBuffer(size); err == nil { return nil } size = size / 2 } return err } memberlist-0.1.0/ping_delegate.go000066400000000000000000000012051307374264600170030ustar00rootroot00000000000000package memberlist import "time" // PingDelegate is used to notify an observer how long it took for a ping message to // complete a round trip. It can also be used for writing arbitrary byte slices // into ack messages. Note that in order to be meaningful for RTT estimates, this // delegate does not apply to indirect pings, nor fallback pings sent over TCP. type PingDelegate interface { // AckPayload is invoked when an ack is being sent; the returned bytes will be appended to the ack AckPayload() []byte // NotifyPing is invoked when an ack for a ping is received NotifyPingComplete(other *Node, rtt time.Duration, payload []byte) } memberlist-0.1.0/queue.go000066400000000000000000000077161307374264600153550ustar00rootroot00000000000000package memberlist import ( "sort" "sync" ) // TransmitLimitedQueue is used to queue messages to broadcast to // the cluster (via gossip) but limits the number of transmits per // message. It also prioritizes messages with lower transmit counts // (hence newer messages). type TransmitLimitedQueue struct { // NumNodes returns the number of nodes in the cluster. This is // used to determine the retransmit count, which is calculated // based on the log of this. NumNodes func() int // RetransmitMult is the multiplier used to determine the maximum // number of retransmissions attempted. RetransmitMult int sync.Mutex bcQueue limitedBroadcasts } type limitedBroadcast struct { transmits int // Number of transmissions attempted. b Broadcast } type limitedBroadcasts []*limitedBroadcast // Broadcast is something that can be broadcasted via gossip to // the memberlist cluster. 
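// Editorial sketch (illustrative, not part of the original source; the namedBroadcast
// type is hypothetical): a minimal implementation of the Broadcast interface defined
// just below, suitable for use with TransmitLimitedQueue.QueueBroadcast.
//
//	type namedBroadcast struct {
//		name string
//		msg  []byte
//	}
//
//	// A newer broadcast about the same name supersedes an older one.
//	func (b *namedBroadcast) Invalidates(other Broadcast) bool {
//		o, ok := other.(*namedBroadcast)
//		return ok && o.name == b.name
//	}
//	func (b *namedBroadcast) Message() []byte { return b.msg }
//	func (b *namedBroadcast) Finished()       {}
//
//	q := &TransmitLimitedQueue{NumNodes: func() int { return 3 }, RetransmitMult: 3}
//	q.QueueBroadcast(&namedBroadcast{name: "node-a", msg: []byte("payload")})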
type Broadcast interface { // Invalidates checks if enqueuing the current broadcast // invalidates a previous broadcast Invalidates(b Broadcast) bool // Returns a byte form of the message Message() []byte // Finished is invoked when the message will no longer // be broadcast, either due to invalidation or to the // transmit limit being reached Finished() } // QueueBroadcast is used to enqueue a broadcast func (q *TransmitLimitedQueue) QueueBroadcast(b Broadcast) { q.Lock() defer q.Unlock() // Check if this message invalidates another n := len(q.bcQueue) for i := 0; i < n; i++ { if b.Invalidates(q.bcQueue[i].b) { q.bcQueue[i].b.Finished() copy(q.bcQueue[i:], q.bcQueue[i+1:]) q.bcQueue[n-1] = nil q.bcQueue = q.bcQueue[:n-1] n-- } } // Append to the queue q.bcQueue = append(q.bcQueue, &limitedBroadcast{0, b}) } // GetBroadcasts is used to get a number of broadcasts, up to a byte limit // and applying a per-message overhead as provided. func (q *TransmitLimitedQueue) GetBroadcasts(overhead, limit int) [][]byte { q.Lock() defer q.Unlock() // Fast path the default case if len(q.bcQueue) == 0 { return nil } transmitLimit := retransmitLimit(q.RetransmitMult, q.NumNodes()) bytesUsed := 0 var toSend [][]byte for i := len(q.bcQueue) - 1; i >= 0; i-- { // Check if this is within our limits b := q.bcQueue[i] msg := b.b.Message() if bytesUsed+overhead+len(msg) > limit { continue } // Add to slice to send bytesUsed += overhead + len(msg) toSend = append(toSend, msg) // Check if we should stop transmission b.transmits++ if b.transmits >= transmitLimit { b.b.Finished() n := len(q.bcQueue) q.bcQueue[i], q.bcQueue[n-1] = q.bcQueue[n-1], nil q.bcQueue = q.bcQueue[:n-1] } } // If we are sending anything, we need to re-sort to deal // with adjusted transmit counts if len(toSend) > 0 { q.bcQueue.Sort() } return toSend } // NumQueued returns the number of queued messages func (q *TransmitLimitedQueue) NumQueued() int { q.Lock() defer q.Unlock() return len(q.bcQueue) } // Reset clears all the queued messages func (q *TransmitLimitedQueue) Reset() { q.Lock() defer q.Unlock() for _, b := range q.bcQueue { b.b.Finished() } q.bcQueue = nil } // Prune will retain the maxRetain latest messages, and the rest // will be discarded. 
This can be used to prevent unbounded queue sizes func (q *TransmitLimitedQueue) Prune(maxRetain int) { q.Lock() defer q.Unlock() // Do nothing if queue size is less than the limit n := len(q.bcQueue) if n < maxRetain { return } // Invalidate the messages we will be removing for i := 0; i < n-maxRetain; i++ { q.bcQueue[i].b.Finished() } // Move the messages, and retain only the last maxRetain copy(q.bcQueue[0:], q.bcQueue[n-maxRetain:]) q.bcQueue = q.bcQueue[:maxRetain] } func (b limitedBroadcasts) Len() int { return len(b) } func (b limitedBroadcasts) Less(i, j int) bool { return b[i].transmits < b[j].transmits } func (b limitedBroadcasts) Swap(i, j int) { b[i], b[j] = b[j], b[i] } func (b limitedBroadcasts) Sort() { sort.Sort(sort.Reverse(b)) } memberlist-0.1.0/queue_test.go000066400000000000000000000107041307374264600164030ustar00rootroot00000000000000package memberlist import ( "testing" ) func TestTransmitLimited_Queue(t *testing.T) { q := &TransmitLimitedQueue{RetransmitMult: 1, NumNodes: func() int { return 1 }} q.QueueBroadcast(&memberlistBroadcast{"test", nil, nil}) q.QueueBroadcast(&memberlistBroadcast{"foo", nil, nil}) q.QueueBroadcast(&memberlistBroadcast{"bar", nil, nil}) if len(q.bcQueue) != 3 { t.Fatalf("bad len") } if q.bcQueue[0].b.(*memberlistBroadcast).node != "test" { t.Fatalf("missing test") } if q.bcQueue[1].b.(*memberlistBroadcast).node != "foo" { t.Fatalf("missing foo") } if q.bcQueue[2].b.(*memberlistBroadcast).node != "bar" { t.Fatalf("missing bar") } // Should invalidate previous message q.QueueBroadcast(&memberlistBroadcast{"test", nil, nil}) if len(q.bcQueue) != 3 { t.Fatalf("bad len") } if q.bcQueue[0].b.(*memberlistBroadcast).node != "foo" { t.Fatalf("missing foo") } if q.bcQueue[1].b.(*memberlistBroadcast).node != "bar" { t.Fatalf("missing bar") } if q.bcQueue[2].b.(*memberlistBroadcast).node != "test" { t.Fatalf("missing test") } } func TestTransmitLimited_GetBroadcasts(t *testing.T) { q := &TransmitLimitedQueue{RetransmitMult: 3, NumNodes: func() int { return 10 }} // 18 bytes per message q.QueueBroadcast(&memberlistBroadcast{"test", []byte("1. this is a test."), nil}) q.QueueBroadcast(&memberlistBroadcast{"foo", []byte("2. this is a test."), nil}) q.QueueBroadcast(&memberlistBroadcast{"bar", []byte("3. this is a test."), nil}) q.QueueBroadcast(&memberlistBroadcast{"baz", []byte("4. this is a test."), nil}) // 2 byte overhead per message, should get all 4 messages all := q.GetBroadcasts(2, 80) if len(all) != 4 { t.Fatalf("missing messages: %v", all) } // 3 byte overhead, should only get 3 messages back partial := q.GetBroadcasts(3, 80) if len(partial) != 3 { t.Fatalf("missing messages: %v", partial) } } func TestTransmitLimited_GetBroadcasts_Limit(t *testing.T) { q := &TransmitLimitedQueue{RetransmitMult: 1, NumNodes: func() int { return 10 }} // 18 bytes per message q.QueueBroadcast(&memberlistBroadcast{"test", []byte("1. this is a test."), nil}) q.QueueBroadcast(&memberlistBroadcast{"foo", []byte("2. this is a test."), nil}) q.QueueBroadcast(&memberlistBroadcast{"bar", []byte("3. this is a test."), nil}) q.QueueBroadcast(&memberlistBroadcast{"baz", []byte("4. 
this is a test."), nil}) // 3 byte overhead, should only get 3 messages back partial1 := q.GetBroadcasts(3, 80) if len(partial1) != 3 { t.Fatalf("missing messages: %v", partial1) } partial2 := q.GetBroadcasts(3, 80) if len(partial2) != 3 { t.Fatalf("missing messages: %v", partial2) } // Only two not expired partial3 := q.GetBroadcasts(3, 80) if len(partial3) != 2 { t.Fatalf("missing messages: %v", partial3) } // Should get nothing partial5 := q.GetBroadcasts(3, 80) if len(partial5) != 0 { t.Fatalf("missing messages: %v", partial5) } } func TestTransmitLimited_Prune(t *testing.T) { q := &TransmitLimitedQueue{RetransmitMult: 1, NumNodes: func() int { return 10 }} ch1 := make(chan struct{}, 1) ch2 := make(chan struct{}, 1) // 18 bytes per message q.QueueBroadcast(&memberlistBroadcast{"test", []byte("1. this is a test."), ch1}) q.QueueBroadcast(&memberlistBroadcast{"foo", []byte("2. this is a test."), ch2}) q.QueueBroadcast(&memberlistBroadcast{"bar", []byte("3. this is a test."), nil}) q.QueueBroadcast(&memberlistBroadcast{"baz", []byte("4. this is a test."), nil}) // Keep only 2 q.Prune(2) if q.NumQueued() != 2 { t.Fatalf("bad len") } // Should notify the first two select { case <-ch1: default: t.Fatalf("expected invalidation") } select { case <-ch2: default: t.Fatalf("expected invalidation") } if q.bcQueue[0].b.(*memberlistBroadcast).node != "bar" { t.Fatalf("missing bar") } if q.bcQueue[1].b.(*memberlistBroadcast).node != "baz" { t.Fatalf("missing baz") } } func TestLimitedBroadcastSort(t *testing.T) { bc := limitedBroadcasts([]*limitedBroadcast{ &limitedBroadcast{ transmits: 0, }, &limitedBroadcast{ transmits: 10, }, &limitedBroadcast{ transmits: 3, }, &limitedBroadcast{ transmits: 4, }, &limitedBroadcast{ transmits: 7, }, }) bc.Sort() if bc[0].transmits != 10 { t.Fatalf("bad val %v", bc[0]) } if bc[1].transmits != 7 { t.Fatalf("bad val %v", bc[1]) } if bc[2].transmits != 4 { t.Fatalf("bad val %v", bc[2]) } if bc[3].transmits != 3 { t.Fatalf("bad val %v", bc[3]) } if bc[4].transmits != 0 { t.Fatalf("bad val %v", bc[4]) } } memberlist-0.1.0/security.go000066400000000000000000000116501307374264600160700ustar00rootroot00000000000000package memberlist import ( "bytes" "crypto/aes" "crypto/cipher" "crypto/rand" "fmt" "io" ) /* Encrypted messages are prefixed with an encryptionVersion byte that is used for us to be able to properly encode/decode. We currently support the following versions: 0 - AES-GCM 128, using PKCS7 padding 1 - AES-GCM 128, no padding. Padding not needed, caused bloat. */ type encryptionVersion uint8 const ( minEncryptionVersion encryptionVersion = 0 maxEncryptionVersion encryptionVersion = 1 ) const ( versionSize = 1 nonceSize = 12 tagSize = 16 maxPadOverhead = 16 blockSize = aes.BlockSize ) // pkcs7encode is used to pad a byte buffer to a specific block size using // the PKCS7 algorithm.
"Ignores" some bytes to compensate for IV func pkcs7encode(buf *bytes.Buffer, ignore, blockSize int) { n := buf.Len() - ignore more := blockSize - (n % blockSize) for i := 0; i < more; i++ { buf.WriteByte(byte(more)) } } // pkcs7decode is used to decode a buffer that has been padded func pkcs7decode(buf []byte, blockSize int) []byte { if len(buf) == 0 { panic("Cannot decode a PKCS7 buffer of zero length") } n := len(buf) last := buf[n-1] n -= int(last) return buf[:n] } // encryptOverhead returns the maximum possible overhead of encryption by version func encryptOverhead(vsn encryptionVersion) int { switch vsn { case 0: return 45 // Version: 1, IV: 12, Padding: 16, Tag: 16 case 1: return 29 // Version: 1, IV: 12, Tag: 16 default: panic("unsupported version") } } // encryptedLength is used to compute the buffer size needed // for a message of given length func encryptedLength(vsn encryptionVersion, inp int) int { // If we are on version 1, there is no padding if vsn >= 1 { return versionSize + nonceSize + inp + tagSize } // Determine the padding size padding := blockSize - (inp % blockSize) // Sum the extra parts to get total size return versionSize + nonceSize + inp + padding + tagSize } // encryptPayload is used to encrypt a message with a given key. // We make use of AES-128 in GCM mode. New byte buffer is the version, // nonce, ciphertext and tag func encryptPayload(vsn encryptionVersion, key []byte, msg []byte, data []byte, dst *bytes.Buffer) error { // Get the AES block cipher aesBlock, err := aes.NewCipher(key) if err != nil { return err } // Get the GCM cipher mode gcm, err := cipher.NewGCM(aesBlock) if err != nil { return err } // Grow the buffer to make room for everything offset := dst.Len() dst.Grow(encryptedLength(vsn, len(msg))) // Write the encryption version dst.WriteByte(byte(vsn)) // Add a random nonce io.CopyN(dst, rand.Reader, nonceSize) afterNonce := dst.Len() // Ensure we are correctly padded (only version 0) if vsn == 0 { io.Copy(dst, bytes.NewReader(msg)) pkcs7encode(dst, offset+versionSize+nonceSize, aes.BlockSize) } // Encrypt message using GCM slice := dst.Bytes()[offset:] nonce := slice[versionSize : versionSize+nonceSize] // Message source depends on the encryption version. // Version 0 uses padding, version 1 does not var src []byte if vsn == 0 { src = slice[versionSize+nonceSize:] } else { src = msg } out := gcm.Seal(nil, nonce, src, data) // Truncate the plaintext, and write the cipher text dst.Truncate(afterNonce) dst.Write(out) return nil } // decryptMessage performs the actual decryption of ciphertext. This is in its // own function to allow it to be called on all keys easily. func decryptMessage(key, msg []byte, data []byte) ([]byte, error) { // Get the AES block cipher aesBlock, err := aes.NewCipher(key) if err != nil { return nil, err } // Get the GCM cipher mode gcm, err := cipher.NewGCM(aesBlock) if err != nil { return nil, err } // Decrypt the message nonce := msg[versionSize : versionSize+nonceSize] ciphertext := msg[versionSize+nonceSize:] plain, err := gcm.Open(nil, nonce, ciphertext, data) if err != nil { return nil, err } // Success! return plain, nil } // decryptPayload is used to decrypt a message with a given key, // and verify it's contents. Any padding will be removed, and a // slice to the plaintext is returned. Decryption is done IN PLACE! 
func decryptPayload(keys [][]byte, msg []byte, data []byte) ([]byte, error) { // Ensure we have at least one byte if len(msg) == 0 { return nil, fmt.Errorf("Cannot decrypt empty payload") } // Verify the version vsn := encryptionVersion(msg[0]) if vsn > maxEncryptionVersion { return nil, fmt.Errorf("Unsupported encryption version %d", msg[0]) } // Ensure the length is sane if len(msg) < encryptedLength(vsn, 0) { return nil, fmt.Errorf("Payload is too small to decrypt: %d", len(msg)) } for _, key := range keys { plain, err := decryptMessage(key, msg, data) if err == nil { // Remove the PKCS7 padding for vsn 0 if vsn == 0 { return pkcs7decode(plain, aes.BlockSize), nil } else { return plain, nil } } } return nil, fmt.Errorf("No installed keys could decrypt the message") } memberlist-0.1.0/security_test.go000066400000000000000000000027451307374264600171340ustar00rootroot00000000000000package memberlist import ( "bytes" "reflect" "testing" ) func TestPKCS7(t *testing.T) { for i := 0; i <= 255; i++ { // Make a buffer of size i buf := []byte{} for j := 0; j < i; j++ { buf = append(buf, byte(i)) } // Copy to bytes buffer inp := bytes.NewBuffer(nil) inp.Write(buf) // Pad this out pkcs7encode(inp, 0, 16) // Unpad dec := pkcs7decode(inp.Bytes(), 16) // Ensure equivilence if !reflect.DeepEqual(buf, dec) { t.Fatalf("mismatch: %v %v", buf, dec) } } } func TestEncryptDecrypt_V0(t *testing.T) { encryptDecryptVersioned(0, t) } func TestEncryptDecrypt_V1(t *testing.T) { encryptDecryptVersioned(1, t) } func encryptDecryptVersioned(vsn encryptionVersion, t *testing.T) { k1 := []byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15} plaintext := []byte("this is a plain text message") extra := []byte("random data") var buf bytes.Buffer err := encryptPayload(vsn, k1, plaintext, extra, &buf) if err != nil { t.Fatalf("err: %v", err) } expLen := encryptedLength(vsn, len(plaintext)) if buf.Len() != expLen { t.Fatalf("output length is unexpected %d %d %d", len(plaintext), buf.Len(), expLen) } msg, err := decryptPayload([][]byte{k1}, buf.Bytes(), extra) if err != nil { t.Fatalf("err: %v", err) } cmp := bytes.Compare(msg, plaintext) if cmp != 0 { t.Errorf("len %d %v", len(msg), msg) t.Errorf("len %d %v", len(plaintext), plaintext) t.Fatalf("encrypt/decrypt failed! %d '%s' '%s'", cmp, msg, plaintext) } } memberlist-0.1.0/state.go000066400000000000000000001011461307374264600153410ustar00rootroot00000000000000package memberlist import ( "bytes" "fmt" "math" "math/rand" "net" "sync/atomic" "time" "github.com/armon/go-metrics" ) type nodeStateType int const ( stateAlive nodeStateType = iota stateSuspect stateDead ) // Node represents a node in the cluster. type Node struct { Name string Addr net.IP Port uint16 Meta []byte // Metadata from the delegate for this node. PMin uint8 // Minimum protocol version this understands PMax uint8 // Maximum protocol version this understands PCur uint8 // Current version node is speaking DMin uint8 // Min protocol version for the delegate to understand DMax uint8 // Max protocol version for the delegate to understand DCur uint8 // Current version delegate is speaking } // Address returns the host:port form of a node's address, suitable for use // with a transport. 
func (n *Node) Address() string { return joinHostPort(n.Addr.String(), n.Port) } // NodeState is used to manage our state view of another node type nodeState struct { Node Incarnation uint32 // Last known incarnation number State nodeStateType // Current state StateChange time.Time // Time last state change happened } // Address returns the host:port form of a node's address, suitable for use // with a transport. func (n *nodeState) Address() string { return n.Node.Address() } // ackHandler is used to register handlers for incoming acks and nacks. type ackHandler struct { ackFn func([]byte, time.Time) nackFn func() timer *time.Timer } // NoPingResponseError is used to indicate a 'ping' packet was // successfully issued but no response was received type NoPingResponseError struct { node string } func (f NoPingResponseError) Error() string { return fmt.Sprintf("No response from node %s", f.node) } // Schedule is used to ensure the Tick is performed periodically. This // function is safe to call multiple times. If the memberlist is already // scheduled, then it won't do anything. func (m *Memberlist) schedule() { m.tickerLock.Lock() defer m.tickerLock.Unlock() // If we already have tickers, then don't do anything, since we're // scheduled if len(m.tickers) > 0 { return } // Create the stop tick channel, a blocking channel. We close this // when we should stop the tickers. stopCh := make(chan struct{}) // Create a new probeTicker if m.config.ProbeInterval > 0 { t := time.NewTicker(m.config.ProbeInterval) go m.triggerFunc(m.config.ProbeInterval, t.C, stopCh, m.probe) m.tickers = append(m.tickers, t) } // Create a push pull ticker if needed if m.config.PushPullInterval > 0 { go m.pushPullTrigger(stopCh) } // Create a gossip ticker if needed if m.config.GossipInterval > 0 && m.config.GossipNodes > 0 { t := time.NewTicker(m.config.GossipInterval) go m.triggerFunc(m.config.GossipInterval, t.C, stopCh, m.gossip) m.tickers = append(m.tickers, t) } // If we made any tickers, then record the stopTick channel for // later. if len(m.tickers) > 0 { m.stopTick = stopCh } } // triggerFunc is used to trigger a function call each time a // message is received until a stop tick arrives. func (m *Memberlist) triggerFunc(stagger time.Duration, C <-chan time.Time, stop <-chan struct{}, f func()) { // Use a random stagger to avoid syncronizing randStagger := time.Duration(uint64(rand.Int63()) % uint64(stagger)) select { case <-time.After(randStagger): case <-stop: return } for { select { case <-C: f() case <-stop: return } } } // pushPullTrigger is used to periodically trigger a push/pull until // a stop tick arrives. We don't use triggerFunc since the push/pull // timer is dynamically scaled based on cluster size to avoid network // saturation func (m *Memberlist) pushPullTrigger(stop <-chan struct{}) { interval := m.config.PushPullInterval // Use a random stagger to avoid syncronizing randStagger := time.Duration(uint64(rand.Int63()) % uint64(interval)) select { case <-time.After(randStagger): case <-stop: return } // Tick using a dynamic timer for { tickTime := pushPullScale(interval, m.estNumNodes()) select { case <-time.After(tickTime): m.pushPull() case <-stop: return } } } // Deschedule is used to stop the background maintenance. This is safe // to call multiple times. func (m *Memberlist) deschedule() { m.tickerLock.Lock() defer m.tickerLock.Unlock() // If we have no tickers, then we aren't scheduled. if len(m.tickers) == 0 { return } // Close the stop channel so all the ticker listeners stop. 
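	// (Closing the channel is observed by every receiver at once: each
	// triggerFunc and pushPullTrigger goroutine blocked in its select sees
	// <-stop fire and returns, whereas a send would only wake one of them.)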
close(m.stopTick) // Explicitly stop all the tickers themselves so they don't take // up any more resources, and get rid of the list. for _, t := range m.tickers { t.Stop() } m.tickers = nil } // Tick is used to perform a single round of failure detection and gossip func (m *Memberlist) probe() { // Track the number of indexes we've considered probing numCheck := 0 START: m.nodeLock.RLock() // Make sure we don't wrap around infinitely if numCheck >= len(m.nodes) { m.nodeLock.RUnlock() return } // Handle the wrap around case if m.probeIndex >= len(m.nodes) { m.nodeLock.RUnlock() m.resetNodes() m.probeIndex = 0 numCheck++ goto START } // Determine if we should probe this node skip := false var node nodeState node = *m.nodes[m.probeIndex] if node.Name == m.config.Name { skip = true } else if node.State == stateDead { skip = true } // Potentially skip m.nodeLock.RUnlock() m.probeIndex++ if skip { numCheck++ goto START } // Probe the specific node m.probeNode(&node) } // probeNode handles a single round of failure checking on a node. func (m *Memberlist) probeNode(node *nodeState) { defer metrics.MeasureSince([]string{"memberlist", "probeNode"}, time.Now()) // We use our health awareness to scale the overall probe interval, so we // slow down if we detect problems. The ticker that calls us can handle // us running over the base interval, and will skip missed ticks. probeInterval := m.awareness.ScaleTimeout(m.config.ProbeInterval) if probeInterval > m.config.ProbeInterval { metrics.IncrCounter([]string{"memberlist", "degraded", "probe"}, 1) } // Prepare a ping message and setup an ack handler. ping := ping{SeqNo: m.nextSeqNo(), Node: node.Name} ackCh := make(chan ackMessage, m.config.IndirectChecks+1) nackCh := make(chan struct{}, m.config.IndirectChecks+1) m.setProbeChannels(ping.SeqNo, ackCh, nackCh, probeInterval) // Send a ping to the node. If this node looks like it's suspect or dead, // also tack on a suspect message so that it has a chance to refute as // soon as possible. deadline := time.Now().Add(probeInterval) addr := node.Address() if node.State == stateAlive { if err := m.encodeAndSendMsg(addr, pingMsg, &ping); err != nil { m.logger.Printf("[ERR] memberlist: Failed to send ping: %s", err) return } } else { var msgs [][]byte if buf, err := encode(pingMsg, &ping); err != nil { m.logger.Printf("[ERR] memberlist: Failed to encode ping message: %s", err) return } else { msgs = append(msgs, buf.Bytes()) } s := suspect{Incarnation: node.Incarnation, Node: node.Name, From: m.config.Name} if buf, err := encode(suspectMsg, &s); err != nil { m.logger.Printf("[ERR] memberlist: Failed to encode suspect message: %s", err) return } else { msgs = append(msgs, buf.Bytes()) } compound := makeCompoundMessage(msgs) if err := m.rawSendMsgPacket(addr, &node.Node, compound.Bytes()); err != nil { m.logger.Printf("[ERR] memberlist: Failed to send compound ping and suspect message to %s: %s", addr, err) return } } // Mark the sent time here, which should be after any pre-processing and // system calls to do the actual send. This probably under-reports a bit, // but it's the best we can do. sent := time.Now() // Arrange for our self-awareness to get updated. At this point we've // sent the ping, so any return statement means the probe succeeded // which will improve our health until we get to the failure scenarios // at the end of this function, which will alter this delta variable // accordingly. 
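	// In other words, the -1 below is the "success" value: a clean ack
	// returns early and leaves it in place for the deferred ApplyDelta,
	// while the failure paths at the bottom of this function overwrite it
	// with zero or a positive penalty before that deferred call runs.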
awarenessDelta := -1 defer func() { m.awareness.ApplyDelta(awarenessDelta) }() // Wait for response or round-trip-time. select { case v := <-ackCh: if v.Complete == true { if m.config.Ping != nil { rtt := v.Timestamp.Sub(sent) m.config.Ping.NotifyPingComplete(&node.Node, rtt, v.Payload) } return } // As an edge case, if we get a timeout, we need to re-enqueue it // here to break out of the select below. if v.Complete == false { ackCh <- v } case <-time.After(m.config.ProbeTimeout): // Note that we don't scale this timeout based on awareness and // the health score. That's because we don't really expect waiting // longer to help get UDP through. Since health does extend the // probe interval it will give the TCP fallback more time, which // is more active in dealing with lost packets, and it gives more // time to wait for indirect acks/nacks. m.logger.Printf("[DEBUG] memberlist: Failed ping: %v (timeout reached)", node.Name) } // Get some random live nodes. m.nodeLock.RLock() kNodes := kRandomNodes(m.config.IndirectChecks, m.nodes, func(n *nodeState) bool { return n.Name == m.config.Name || n.Name == node.Name || n.State != stateAlive }) m.nodeLock.RUnlock() // Attempt an indirect ping. expectedNacks := 0 ind := indirectPingReq{SeqNo: ping.SeqNo, Target: node.Addr, Port: node.Port, Node: node.Name} for _, peer := range kNodes { // We only expect nack to be sent from peers who understand // version 4 of the protocol. if ind.Nack = peer.PMax >= 4; ind.Nack { expectedNacks++ } if err := m.encodeAndSendMsg(peer.Address(), indirectPingMsg, &ind); err != nil { m.logger.Printf("[ERR] memberlist: Failed to send indirect ping: %s", err) } } // Also make an attempt to contact the node directly over TCP. This // helps prevent confused clients who get isolated from UDP traffic // but can still speak TCP (which also means they can possibly report // misinformation to other nodes via anti-entropy), avoiding flapping in // the cluster. // // This is a little unusual because we will attempt a TCP ping to any // member who understands version 3 of the protocol, regardless of // which protocol version we are speaking. That's why we've included a // config option to turn this off if desired. fallbackCh := make(chan bool, 1) if (!m.config.DisableTcpPings) && (node.PMax >= 3) { go func() { defer close(fallbackCh) didContact, err := m.sendPingAndWaitForAck(node.Address(), ping, deadline) if err != nil { m.logger.Printf("[ERR] memberlist: Failed fallback ping: %s", err) } else { fallbackCh <- didContact } }() } else { close(fallbackCh) } // Wait for the acks or timeout. Note that we don't check the fallback // channel here because we want to issue a warning below if that's the // *only* way we hear back from the peer, so we have to let this time // out first to allow the normal UDP-based acks to come in. select { case v := <-ackCh: if v.Complete == true { return } } // Finally, poll the fallback channel. The timeouts are set such that // the channel will have something or be closed without having to wait // any additional time here. for didContact := range fallbackCh { if didContact { m.logger.Printf("[WARN] memberlist: Was able to connect to %s but other probes failed, network may be misconfigured", node.Name) return } } // Update our self-awareness based on the results of this failed probe. // If we don't have peers who will send nacks then we penalize for any // failed probe as a simple health metric. 
If we do have peers to nack // verify, then we can use that as a more sophisticated measure of self- // health because we assume them to be working, and they can help us // decide if the probed node was really dead or if it was something wrong // with ourselves. awarenessDelta = 0 if expectedNacks > 0 { if nackCount := len(nackCh); nackCount < expectedNacks { awarenessDelta += (expectedNacks - nackCount) } } else { awarenessDelta += 1 } // No acks received from target, suspect it as failed. m.logger.Printf("[INFO] memberlist: Suspect %s has failed, no acks received", node.Name) s := suspect{Incarnation: node.Incarnation, Node: node.Name, From: m.config.Name} m.suspectNode(&s) } // Ping initiates a ping to the node with the specified name. func (m *Memberlist) Ping(node string, addr net.Addr) (time.Duration, error) { // Prepare a ping message and setup an ack handler. ping := ping{SeqNo: m.nextSeqNo(), Node: node} ackCh := make(chan ackMessage, m.config.IndirectChecks+1) m.setProbeChannels(ping.SeqNo, ackCh, nil, m.config.ProbeInterval) // Send a ping to the node. if err := m.encodeAndSendMsg(addr.String(), pingMsg, &ping); err != nil { return 0, err } // Mark the sent time here, which should be after any pre-processing and // system calls to do the actual send. This probably under-reports a bit, // but it's the best we can do. sent := time.Now() // Wait for response or timeout. select { case v := <-ackCh: if v.Complete == true { return v.Timestamp.Sub(sent), nil } case <-time.After(m.config.ProbeTimeout): // Timeout, return an error below. } m.logger.Printf("[DEBUG] memberlist: Failed UDP ping: %v (timeout reached)", node) return 0, NoPingResponseError{ping.Node} } // resetNodes is used when the tick wraps around. It will reap the // dead nodes and shuffle the node list. func (m *Memberlist) resetNodes() { m.nodeLock.Lock() defer m.nodeLock.Unlock() // Move dead nodes, but respect gossip to the dead interval deadIdx := moveDeadNodes(m.nodes, m.config.GossipToTheDeadTime) // Deregister the dead nodes for i := deadIdx; i < len(m.nodes); i++ { delete(m.nodeMap, m.nodes[i].Name) m.nodes[i] = nil } // Trim the nodes to exclude the dead nodes m.nodes = m.nodes[0:deadIdx] // Update numNodes after we've trimmed the dead nodes atomic.StoreUint32(&m.numNodes, uint32(deadIdx)) // Shuffle live nodes shuffleNodes(m.nodes) } // gossip is invoked every GossipInterval period to broadcast our gossip // messages to a few random nodes. 
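//
// Each selected node receives at most one packet per round: pending
// broadcasts are packed into whatever space remains after the compound
// message header and, when encryption is enabled, the per-message
// encryption overhead (29 bytes for version 1, 45 for version 0, per
// encryptOverhead).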
func (m *Memberlist) gossip() { defer metrics.MeasureSince([]string{"memberlist", "gossip"}, time.Now()) // Get some random live, suspect, or recently dead nodes m.nodeLock.RLock() kNodes := kRandomNodes(m.config.GossipNodes, m.nodes, func(n *nodeState) bool { if n.Name == m.config.Name { return true } switch n.State { case stateAlive, stateSuspect: return false case stateDead: return time.Since(n.StateChange) > m.config.GossipToTheDeadTime default: return true } }) m.nodeLock.RUnlock() // Compute the bytes available bytesAvail := m.config.UDPBufferSize - compoundHeaderOverhead if m.config.EncryptionEnabled() { bytesAvail -= encryptOverhead(m.encryptionVersion()) } for _, node := range kNodes { // Get any pending broadcasts msgs := m.getBroadcasts(compoundOverhead, bytesAvail) if len(msgs) == 0 { return } addr := node.Address() if len(msgs) == 1 { // Send single message as is if err := m.rawSendMsgPacket(addr, &node.Node, msgs[0]); err != nil { m.logger.Printf("[ERR] memberlist: Failed to send gossip to %s: %s", addr, err) } } else { // Otherwise create and send a compound message compound := makeCompoundMessage(msgs) if err := m.rawSendMsgPacket(addr, &node.Node, compound.Bytes()); err != nil { m.logger.Printf("[ERR] memberlist: Failed to send gossip to %s: %s", addr, err) } } } } // pushPull is invoked periodically to randomly perform a complete state // exchange. Used to ensure a high level of convergence, but is also // reasonably expensive as the entire state of this node is exchanged // with the other node. func (m *Memberlist) pushPull() { // Get a random live node m.nodeLock.RLock() nodes := kRandomNodes(1, m.nodes, func(n *nodeState) bool { return n.Name == m.config.Name || n.State != stateAlive }) m.nodeLock.RUnlock() // If no nodes, bail if len(nodes) == 0 { return } node := nodes[0] // Attempt a push pull if err := m.pushPullNode(node.Address(), false); err != nil { m.logger.Printf("[ERR] memberlist: Push/Pull with %s failed: %s", node.Name, err) } } // pushPullNode does a complete state exchange with a specific node. func (m *Memberlist) pushPullNode(addr string, join bool) error { defer metrics.MeasureSince([]string{"memberlist", "pushPullNode"}, time.Now()) // Attempt to send and receive with the node remote, userState, err := m.sendAndReceiveState(addr, join) if err != nil { return err } if err := m.mergeRemoteState(join, remote, userState); err != nil { return err } return nil } // verifyProtocol verifies that all the remote nodes can speak with our // nodes and vice versa on both the core protocol as well as the // delegate protocol level. // // The verification works by finding the maximum minimum and // minimum maximum understood protocol and delegate versions. In other words, // it finds the common denominator of protocol and delegate version ranges // for the entire cluster. // // After this, it goes through the entire cluster (local and remote) and // verifies that everyone's speaking protocol versions satisfy this range. // If this passes, it means that every node can understand each other. func (m *Memberlist) verifyProtocol(remote []pushNodeState) error { m.nodeLock.RLock() defer m.nodeLock.RUnlock() // Maximum minimum understood and minimum maximum understood for both // the protocol and delegate versions. We use this to verify everyone // can be understood. 
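	// For example, if one remote node advertises protocol versions [1, 3]
	// and another [2, 5], the usable window for the cluster is
	// [max(1, 2), min(3, 5)] = [2, 3]; any node whose current version falls
	// outside that window is rejected below.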
var maxpmin, minpmax uint8 var maxdmin, mindmax uint8 minpmax = math.MaxUint8 mindmax = math.MaxUint8 for _, rn := range remote { // If the node isn't alive, then skip it if rn.State != stateAlive { continue } // Skip nodes that don't have versions set, it just means // their version is zero. if len(rn.Vsn) == 0 { continue } if rn.Vsn[0] > maxpmin { maxpmin = rn.Vsn[0] } if rn.Vsn[1] < minpmax { minpmax = rn.Vsn[1] } if rn.Vsn[3] > maxdmin { maxdmin = rn.Vsn[3] } if rn.Vsn[4] < mindmax { mindmax = rn.Vsn[4] } } for _, n := range m.nodes { // Ignore non-alive nodes if n.State != stateAlive { continue } if n.PMin > maxpmin { maxpmin = n.PMin } if n.PMax < minpmax { minpmax = n.PMax } if n.DMin > maxdmin { maxdmin = n.DMin } if n.DMax < mindmax { mindmax = n.DMax } } // Now that we definitively know the minimum and maximum understood // version that satisfies the whole cluster, we verify that every // node in the cluster satisifies this. for _, n := range remote { var nPCur, nDCur uint8 if len(n.Vsn) > 0 { nPCur = n.Vsn[2] nDCur = n.Vsn[5] } if nPCur < maxpmin || nPCur > minpmax { return fmt.Errorf( "Node '%s' protocol version (%d) is incompatible: [%d, %d]", n.Name, nPCur, maxpmin, minpmax) } if nDCur < maxdmin || nDCur > mindmax { return fmt.Errorf( "Node '%s' delegate protocol version (%d) is incompatible: [%d, %d]", n.Name, nDCur, maxdmin, mindmax) } } for _, n := range m.nodes { nPCur := n.PCur nDCur := n.DCur if nPCur < maxpmin || nPCur > minpmax { return fmt.Errorf( "Node '%s' protocol version (%d) is incompatible: [%d, %d]", n.Name, nPCur, maxpmin, minpmax) } if nDCur < maxdmin || nDCur > mindmax { return fmt.Errorf( "Node '%s' delegate protocol version (%d) is incompatible: [%d, %d]", n.Name, nDCur, maxdmin, mindmax) } } return nil } // nextSeqNo returns a usable sequence number in a thread safe way func (m *Memberlist) nextSeqNo() uint32 { return atomic.AddUint32(&m.sequenceNum, 1) } // nextIncarnation returns the next incarnation number in a thread safe way func (m *Memberlist) nextIncarnation() uint32 { return atomic.AddUint32(&m.incarnation, 1) } // skipIncarnation adds the positive offset to the incarnation number. func (m *Memberlist) skipIncarnation(offset uint32) uint32 { return atomic.AddUint32(&m.incarnation, offset) } // estNumNodes is used to get the current estimate of the number of nodes func (m *Memberlist) estNumNodes() int { return int(atomic.LoadUint32(&m.numNodes)) } type ackMessage struct { Complete bool Payload []byte Timestamp time.Time } // setProbeChannels is used to attach the ackCh to receive a message when an ack // with a given sequence number is received. The `complete` field of the message // will be false on timeout. Any nack messages will cause an empty struct to be // passed to the nackCh, which can be nil if not needed. 
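//
// A typical caller (see probeNode and Ping above) looks roughly like:
//
//	ackCh := make(chan ackMessage, 1)
//	nackCh := make(chan struct{}, 1)
//	m.setProbeChannels(seqNo, ackCh, nackCh, timeout)
//	// ...send the ping, then select on ackCh and a timer...
//
// Both channels should be buffered, since the handlers drop the message
// rather than block when a channel is full.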
func (m *Memberlist) setProbeChannels(seqNo uint32, ackCh chan ackMessage, nackCh chan struct{}, timeout time.Duration) { // Create handler functions for acks and nacks ackFn := func(payload []byte, timestamp time.Time) { select { case ackCh <- ackMessage{true, payload, timestamp}: default: } } nackFn := func() { select { case nackCh <- struct{}{}: default: } } // Add the handlers ah := &ackHandler{ackFn, nackFn, nil} m.ackLock.Lock() m.ackHandlers[seqNo] = ah m.ackLock.Unlock() // Setup a reaping routing ah.timer = time.AfterFunc(timeout, func() { m.ackLock.Lock() delete(m.ackHandlers, seqNo) m.ackLock.Unlock() select { case ackCh <- ackMessage{false, nil, time.Now()}: default: } }) } // setAckHandler is used to attach a handler to be invoked when an ack with a // given sequence number is received. If a timeout is reached, the handler is // deleted. This is used for indirect pings so does not configure a function // for nacks. func (m *Memberlist) setAckHandler(seqNo uint32, ackFn func([]byte, time.Time), timeout time.Duration) { // Add the handler ah := &ackHandler{ackFn, nil, nil} m.ackLock.Lock() m.ackHandlers[seqNo] = ah m.ackLock.Unlock() // Setup a reaping routing ah.timer = time.AfterFunc(timeout, func() { m.ackLock.Lock() delete(m.ackHandlers, seqNo) m.ackLock.Unlock() }) } // Invokes an ack handler if any is associated, and reaps the handler immediately func (m *Memberlist) invokeAckHandler(ack ackResp, timestamp time.Time) { m.ackLock.Lock() ah, ok := m.ackHandlers[ack.SeqNo] delete(m.ackHandlers, ack.SeqNo) m.ackLock.Unlock() if !ok { return } ah.timer.Stop() ah.ackFn(ack.Payload, timestamp) } // Invokes nack handler if any is associated. func (m *Memberlist) invokeNackHandler(nack nackResp) { m.ackLock.Lock() ah, ok := m.ackHandlers[nack.SeqNo] m.ackLock.Unlock() if !ok || ah.nackFn == nil { return } ah.nackFn() } // refute gossips an alive message in response to incoming information that we // are suspect or dead. It will make sure the incarnation number beats the given // accusedInc value, or you can supply 0 to just get the next incarnation number. // This alters the node state that's passed in so this MUST be called while the // nodeLock is held. func (m *Memberlist) refute(me *nodeState, accusedInc uint32) { // Make sure the incarnation number beats the accusation. inc := m.nextIncarnation() if accusedInc >= inc { inc = m.skipIncarnation(accusedInc - inc + 1) } me.Incarnation = inc // Decrease our health because we are being asked to refute a problem. m.awareness.ApplyDelta(1) // Format and broadcast an alive message. a := alive{ Incarnation: inc, Node: me.Name, Addr: me.Addr, Port: me.Port, Meta: me.Meta, Vsn: []uint8{ me.PMin, me.PMax, me.PCur, me.DMin, me.DMax, me.DCur, }, } m.encodeAndBroadcast(me.Addr.String(), aliveMsg, a) } // aliveNode is invoked by the network layer when we get a message about a // live node. func (m *Memberlist) aliveNode(a *alive, notify chan struct{}, bootstrap bool) { m.nodeLock.Lock() defer m.nodeLock.Unlock() state, ok := m.nodeMap[a.Node] // It is possible that during a Leave(), there is already an aliveMsg // in-queue to be processed but blocked by the locks above. If we let // that aliveMsg process, it'll cause us to re-join the cluster. This // ensures that we don't. if m.leave && a.Node == m.config.Name { return } // Invoke the Alive delegate if any. This can be used to filter out // alive messages based on custom logic. For example, using a cluster name. 
// Using a merge delegate is not enough, as it is possible for passive // cluster merging to still occur. if m.config.Alive != nil { node := &Node{ Name: a.Node, Addr: a.Addr, Port: a.Port, Meta: a.Meta, PMin: a.Vsn[0], PMax: a.Vsn[1], PCur: a.Vsn[2], DMin: a.Vsn[3], DMax: a.Vsn[4], DCur: a.Vsn[5], } if err := m.config.Alive.NotifyAlive(node); err != nil { m.logger.Printf("[WARN] memberlist: ignoring alive message for '%s': %s", a.Node, err) return } } // Check if we've never seen this node before, and if not, then // store this node in our node map. if !ok { state = &nodeState{ Node: Node{ Name: a.Node, Addr: a.Addr, Port: a.Port, Meta: a.Meta, }, State: stateDead, } // Add to map m.nodeMap[a.Node] = state // Get a random offset. This is important to ensure // the failure detection bound is low on average. If all // nodes did an append, failure detection bound would be // very high. n := len(m.nodes) offset := randomOffset(n) // Add at the end and swap with the node at the offset m.nodes = append(m.nodes, state) m.nodes[offset], m.nodes[n] = m.nodes[n], m.nodes[offset] // Update numNodes after we've added a new node atomic.AddUint32(&m.numNodes, 1) } // Check if this address is different than the existing node if !bytes.Equal([]byte(state.Addr), a.Addr) || state.Port != a.Port { m.logger.Printf("[ERR] memberlist: Conflicting address for %s. Mine: %v:%d Theirs: %v:%d", state.Name, state.Addr, state.Port, net.IP(a.Addr), a.Port) // Inform the conflict delegate if provided if m.config.Conflict != nil { other := Node{ Name: a.Node, Addr: a.Addr, Port: a.Port, Meta: a.Meta, } m.config.Conflict.NotifyConflict(&state.Node, &other) } return } // Bail if the incarnation number is older, and this is not about us isLocalNode := state.Name == m.config.Name if a.Incarnation <= state.Incarnation && !isLocalNode { return } // Bail if strictly less and this is about us if a.Incarnation < state.Incarnation && isLocalNode { return } // Clear out any suspicion timer that may be in effect. delete(m.nodeTimers, a.Node) // Store the old state and meta data oldState := state.State oldMeta := state.Meta // If this is us we need to refute, otherwise re-broadcast if !bootstrap && isLocalNode { // Compute the version vector versions := []uint8{ state.PMin, state.PMax, state.PCur, state.DMin, state.DMax, state.DCur, } // If the Incarnation is the same, we need special handling, since it // possible for the following situation to happen: // 1) Start with configuration C, join cluster // 2) Hard fail / Kill / Shutdown // 3) Restart with configuration C', join cluster // // In this case, other nodes and the local node see the same incarnation, // but the values may not be the same. For this reason, we always // need to do an equality check for this Incarnation. In most cases, // we just ignore, but we may need to refute. 
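		// Concretely: an alive message about the local node can carry the
		// incarnation number we are already on but stale metadata (say, from
		// before a crash-restart with a new configuration); the equality
		// check below catches the mismatch and refutes, bumping our
		// incarnation past the stale record.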
// if a.Incarnation == state.Incarnation && bytes.Equal(a.Meta, state.Meta) && bytes.Equal(a.Vsn, versions) { return } m.refute(state, a.Incarnation) m.logger.Printf("[WARN] memberlist: Refuting an alive message") } else { m.encodeBroadcastNotify(a.Node, aliveMsg, a, notify) // Update protocol versions if it arrived if len(a.Vsn) > 0 { state.PMin = a.Vsn[0] state.PMax = a.Vsn[1] state.PCur = a.Vsn[2] state.DMin = a.Vsn[3] state.DMax = a.Vsn[4] state.DCur = a.Vsn[5] } // Update the state and incarnation number state.Incarnation = a.Incarnation state.Meta = a.Meta if state.State != stateAlive { state.State = stateAlive state.StateChange = time.Now() } } // Update metrics metrics.IncrCounter([]string{"memberlist", "msg", "alive"}, 1) // Notify the delegate of any relevant updates if m.config.Events != nil { if oldState == stateDead { // if Dead -> Alive, notify of join m.config.Events.NotifyJoin(&state.Node) } else if !bytes.Equal(oldMeta, state.Meta) { // if Meta changed, trigger an update notification m.config.Events.NotifyUpdate(&state.Node) } } } // suspectNode is invoked by the network layer when we get a message // about a suspect node func (m *Memberlist) suspectNode(s *suspect) { m.nodeLock.Lock() defer m.nodeLock.Unlock() state, ok := m.nodeMap[s.Node] // If we've never heard about this node before, ignore it if !ok { return } // Ignore old incarnation numbers if s.Incarnation < state.Incarnation { return } // See if there's a suspicion timer we can confirm. If the info is new // to us we will go ahead and re-gossip it. This allows for multiple // independent confirmations to flow even when a node probes a node // that's already suspect. if timer, ok := m.nodeTimers[s.Node]; ok { if timer.Confirm(s.From) { m.encodeAndBroadcast(s.Node, suspectMsg, s) } return } // Ignore non-alive nodes if state.State != stateAlive { return } // If this is us we need to refute, otherwise re-broadcast if state.Name == m.config.Name { m.refute(state, s.Incarnation) m.logger.Printf("[WARN] memberlist: Refuting a suspect message (from: %s)", s.From) return // Do not mark ourself suspect } else { m.encodeAndBroadcast(s.Node, suspectMsg, s) } // Update metrics metrics.IncrCounter([]string{"memberlist", "msg", "suspect"}, 1) // Update the state state.Incarnation = s.Incarnation state.State = stateSuspect changeTime := time.Now() state.StateChange = changeTime // Setup a suspicion timer. Given that we don't have any known phase // relationship with our peers, we set up k such that we hit the nominal // timeout two probe intervals short of what we expect given the suspicion // multiplier. k := m.config.SuspicionMult - 2 // If there aren't enough nodes to give the expected confirmations, just // set k to 0 to say that we don't expect any. Note we subtract 2 from n // here to take out ourselves and the node being probed. n := m.estNumNodes() if n-2 < k { k = 0 } // Compute the timeouts based on the size of the cluster. 
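	// (As implemented by the newSuspicion helper, the timer starts out at
	// max and is driven down toward min as independent confirmations arrive
	// from other peers; with k == 0 it simply fires at min. That is what
	// gives the expected-timeout table in TestMemberList_ProbeNode_Suspect_Dogpile
	// its shape.)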
min := suspicionTimeout(m.config.SuspicionMult, n, m.config.ProbeInterval) max := time.Duration(m.config.SuspicionMaxTimeoutMult) * min fn := func(numConfirmations int) { m.nodeLock.Lock() state, ok := m.nodeMap[s.Node] timeout := ok && state.State == stateSuspect && state.StateChange == changeTime m.nodeLock.Unlock() if timeout { if k > 0 && numConfirmations < k { metrics.IncrCounter([]string{"memberlist", "degraded", "timeout"}, 1) } m.logger.Printf("[INFO] memberlist: Marking %s as failed, suspect timeout reached (%d peer confirmations)", state.Name, numConfirmations) d := dead{Incarnation: state.Incarnation, Node: state.Name, From: m.config.Name} m.deadNode(&d) } } m.nodeTimers[s.Node] = newSuspicion(s.From, k, min, max, fn) } // deadNode is invoked by the network layer when we get a message // about a dead node func (m *Memberlist) deadNode(d *dead) { m.nodeLock.Lock() defer m.nodeLock.Unlock() state, ok := m.nodeMap[d.Node] // If we've never heard about this node before, ignore it if !ok { return } // Ignore old incarnation numbers if d.Incarnation < state.Incarnation { return } // Clear out any suspicion timer that may be in effect. delete(m.nodeTimers, d.Node) // Ignore if node is already dead if state.State == stateDead { return } // Check if this is us if state.Name == m.config.Name { // If we are not leaving we need to refute if !m.leave { m.refute(state, d.Incarnation) m.logger.Printf("[WARN] memberlist: Refuting a dead message (from: %s)", d.From) return // Do not mark ourself dead } // If we are leaving, we broadcast and wait m.encodeBroadcastNotify(d.Node, deadMsg, d, m.leaveBroadcast) } else { m.encodeAndBroadcast(d.Node, deadMsg, d) } // Update metrics metrics.IncrCounter([]string{"memberlist", "msg", "dead"}, 1) // Update the state state.Incarnation = d.Incarnation state.State = stateDead state.StateChange = time.Now() // Notify of death if m.config.Events != nil { m.config.Events.NotifyLeave(&state.Node) } } // mergeState is invoked by the network layer when we get a Push/Pull // state transfer func (m *Memberlist) mergeState(remote []pushNodeState) { for _, r := range remote { switch r.State { case stateAlive: a := alive{ Incarnation: r.Incarnation, Node: r.Name, Addr: r.Addr, Port: r.Port, Meta: r.Meta, Vsn: r.Vsn, } m.aliveNode(&a, nil, false) case stateDead: // If the remote node believes a node is dead, we prefer to // suspect that node instead of declaring it dead instantly fallthrough case stateSuspect: s := suspect{Incarnation: r.Incarnation, Node: r.Name, From: m.config.Name} m.suspectNode(&s) } } } memberlist-0.1.0/state_test.go000066400000000000000000001361601307374264600164040ustar00rootroot00000000000000package memberlist import ( "bytes" "fmt" "net" "testing" "time" ) func HostMemberlist(host string, t *testing.T, f func(*Config)) *Memberlist { c := DefaultLANConfig() c.Name = host c.BindAddr = host if f != nil { f(c) } m, err := newMemberlist(c) if err != nil { t.Fatalf("failed to get memberlist: %s", err) } return m } func TestMemberList_Probe(t *testing.T) { addr1 := getBindAddr() addr2 := getBindAddr() m1 := HostMemberlist(addr1.String(), t, func(c *Config) { c.ProbeTimeout = time.Millisecond c.ProbeInterval = 10 * time.Millisecond }) m2 := HostMemberlist(addr2.String(), t, nil) a1 := alive{ Node: addr1.String(), Addr: []byte(addr1), Port: uint16(m1.config.BindPort), Incarnation: 1, } m1.aliveNode(&a1, nil, true) a2 := alive{ Node: addr2.String(), Addr: []byte(addr2), Port: uint16(m2.config.BindPort), Incarnation: 1, } m1.aliveNode(&a2, nil, false) // 
should ping addr2 m1.probe() // Should not be marked suspect n := m1.nodeMap[addr2.String()] if n.State != stateAlive { t.Fatalf("Expect node to be alive") } // Should increment seqno if m1.sequenceNum != 1 { t.Fatalf("bad seqno %v", m2.sequenceNum) } } func TestMemberList_ProbeNode_Suspect(t *testing.T) { addr1 := getBindAddr() addr2 := getBindAddr() addr3 := getBindAddr() addr4 := getBindAddr() ip1 := []byte(addr1) ip2 := []byte(addr2) ip3 := []byte(addr3) ip4 := []byte(addr4) m1 := HostMemberlist(addr1.String(), t, func(c *Config) { c.ProbeTimeout = time.Millisecond c.ProbeInterval = 10 * time.Millisecond }) m2 := HostMemberlist(addr2.String(), t, nil) m3 := HostMemberlist(addr3.String(), t, nil) a1 := alive{Node: addr1.String(), Addr: ip1, Port: 7946, Incarnation: 1} m1.aliveNode(&a1, nil, true) a2 := alive{Node: addr2.String(), Addr: ip2, Port: 7946, Incarnation: 1} m1.aliveNode(&a2, nil, false) a3 := alive{Node: addr3.String(), Addr: ip3, Port: 7946, Incarnation: 1} m1.aliveNode(&a3, nil, false) a4 := alive{Node: addr4.String(), Addr: ip4, Port: 7946, Incarnation: 1} m1.aliveNode(&a4, nil, false) n := m1.nodeMap[addr4.String()] m1.probeNode(n) // Should be marked suspect. if n.State != stateSuspect { t.Fatalf("Expect node to be suspect") } time.Sleep(10 * time.Millisecond) // One of the peers should have attempted an indirect probe. if m2.sequenceNum != 1 && m3.sequenceNum != 1 { t.Fatalf("bad seqnos %v, %v", m2.sequenceNum, m3.sequenceNum) } } func TestMemberList_ProbeNode_Suspect_Dogpile(t *testing.T) { cases := []struct { numPeers int confirmations int expected time.Duration }{ {1, 0, 500 * time.Millisecond}, // n=2, k=3 (max timeout disabled) {2, 0, 500 * time.Millisecond}, // n=3, k=3 {3, 0, 500 * time.Millisecond}, // n=4, k=3 {4, 0, 1000 * time.Millisecond}, // n=5, k=3 (max timeout starts to take effect) {5, 0, 1000 * time.Millisecond}, // n=6, k=3 {5, 1, 750 * time.Millisecond}, // n=6, k=3 (confirmations start to lower timeout) {5, 2, 604 * time.Millisecond}, // n=6, k=3 {5, 3, 500 * time.Millisecond}, // n=6, k=3 (timeout driven to nominal value) {5, 4, 500 * time.Millisecond}, // n=6, k=3 } for i, c := range cases { // Create the main memberlist under test. addr := getBindAddr() m := HostMemberlist(addr.String(), t, func(c *Config) { c.ProbeTimeout = time.Millisecond c.ProbeInterval = 100 * time.Millisecond c.SuspicionMult = 5 c.SuspicionMaxTimeoutMult = 2 }) a := alive{Node: addr.String(), Addr: []byte(addr), Port: 7946, Incarnation: 1} m.aliveNode(&a, nil, true) // Make all but one peer be an real, alive instance. var peers []*Memberlist for j := 0; j < c.numPeers-1; j++ { peerAddr := getBindAddr() peers = append(peers, HostMemberlist(peerAddr.String(), t, nil)) a = alive{Node: peerAddr.String(), Addr: []byte(peerAddr), Port: 7946, Incarnation: 1} m.aliveNode(&a, nil, false) } // Just use a bogus address for the last peer so it doesn't respond // to pings, but tell the memberlist it's alive. badPeerAddr := getBindAddr() a = alive{Node: badPeerAddr.String(), Addr: []byte(badPeerAddr), Port: 7946, Incarnation: 1} m.aliveNode(&a, nil, false) // Force a probe, which should start us into the suspect state. n := m.nodeMap[badPeerAddr.String()] m.probeNode(n) if n.State != stateSuspect { t.Fatalf("case %d: expected node to be suspect", i) } // Add the requested number of confirmations. 
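	// Each confirmation below uses a distinct From name on purpose: the
	// suspicion timer only counts confirmations from peers it has not heard
	// from yet, so repeating a name would not shorten the timeout further.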
for j := 0; j < c.confirmations; j++ { from := fmt.Sprintf("peer%d", j) s := suspect{Node: badPeerAddr.String(), Incarnation: 1, From: from} m.suspectNode(&s) } // Wait until right before the timeout and make sure the timer // hasn't fired. fudge := 25 * time.Millisecond time.Sleep(c.expected - fudge) if n.State != stateSuspect { t.Fatalf("case %d: expected node to still be suspect", i) } // Wait through the timeout and a little after to make sure the // timer fires. time.Sleep(2 * fudge) if n.State != stateDead { t.Fatalf("case %d: expected node to be dead", i) } } } /* func TestMemberList_ProbeNode_FallbackTCP(t *testing.T) { addr1 := getBindAddr() addr2 := getBindAddr() addr3 := getBindAddr() addr4 := getBindAddr() ip1 := []byte(addr1) ip2 := []byte(addr2) ip3 := []byte(addr3) ip4 := []byte(addr4) var probeTimeMax time.Duration m1 := HostMemberlist(addr1.String(), t, func(c *Config) { c.ProbeTimeout = 10 * time.Millisecond c.ProbeInterval = 200 * time.Millisecond probeTimeMax = c.ProbeInterval + 20*time.Millisecond }) defer m1.Shutdown() m2 := HostMemberlist(addr2.String(), t, nil) defer m2.Shutdown() m3 := HostMemberlist(addr3.String(), t, nil) defer m3.Shutdown() m4 := HostMemberlist(addr4.String(), t, nil) defer m4.Shutdown() a1 := alive{Node: addr1.String(), Addr: ip1, Port: 7946, Incarnation: 1} m1.aliveNode(&a1, nil, true) a2 := alive{Node: addr2.String(), Addr: ip2, Port: 7946, Incarnation: 1} m1.aliveNode(&a2, nil, false) a3 := alive{Node: addr3.String(), Addr: ip3, Port: 7946, Incarnation: 1} m1.aliveNode(&a3, nil, false) // Make sure m4 is configured with the same protocol version as m1 so // the TCP fallback behavior is enabled. a4 := alive{ Node: addr4.String(), Addr: ip4, Port: 7946, Incarnation: 1, Vsn: []uint8{ ProtocolVersionMin, ProtocolVersionMax, m1.config.ProtocolVersion, m1.config.DelegateProtocolMin, m1.config.DelegateProtocolMax, m1.config.DelegateProtocolVersion, }, } m1.aliveNode(&a4, nil, false) // Isolate m4 from UDP traffic by re-opening its listener on the wrong // port. This should force the TCP fallback path to be used. var err error if err = m4.udpListener.Close(); err != nil { t.Fatalf("err: %v", err) } udpAddr := &net.UDPAddr{IP: ip4, Port: 9999} if m4.udpListener, err = net.ListenUDP("udp", udpAddr); err != nil { t.Fatalf("err: %v", err) } // Have node m1 probe m4. n := m1.nodeMap[addr4.String()] startProbe := time.Now() m1.probeNode(n) probeTime := time.Now().Sub(startProbe) // Should be marked alive because of the TCP fallback ping. if n.State != stateAlive { t.Fatalf("expect node to be alive") } // Make sure TCP activity completed in a timely manner. if probeTime > probeTimeMax { t.Fatalf("took to long to probe, %9.6f", probeTime.Seconds()) } // Confirm at least one of the peers attempted an indirect probe. time.Sleep(probeTimeMax) if m2.sequenceNum != 1 && m3.sequenceNum != 1 { t.Fatalf("bad seqnos %v, %v", m2.sequenceNum, m3.sequenceNum) } // Now shutdown all inbound TCP traffic to make sure the TCP fallback // path properly fails when the node is really unreachable. if err = m4.tcpListener.Close(); err != nil { t.Fatalf("err: %v", err) } tcpAddr := &net.TCPAddr{IP: ip4, Port: 9999} if m4.tcpListener, err = net.ListenTCP("tcp", tcpAddr); err != nil { t.Fatalf("err: %v", err) } // Probe again, this time there should be no contact. startProbe = time.Now() m1.probeNode(n) probeTime = time.Now().Sub(startProbe) // Node should be reported suspect. 
if n.State != stateSuspect { t.Fatalf("expect node to be suspect") } // Make sure TCP activity didn't cause us to wait too long before // timing out. if probeTime > probeTimeMax { t.Fatalf("took to long to probe, %9.6f", probeTime.Seconds()) } // Confirm at least one of the peers attempted an indirect probe. time.Sleep(probeTimeMax) if m2.sequenceNum != 2 && m3.sequenceNum != 2 { t.Fatalf("bad seqnos %v, %v", m2.sequenceNum, m3.sequenceNum) } } func TestMemberList_ProbeNode_FallbackTCP_Disabled(t *testing.T) { addr1 := getBindAddr() addr2 := getBindAddr() addr3 := getBindAddr() addr4 := getBindAddr() ip1 := []byte(addr1) ip2 := []byte(addr2) ip3 := []byte(addr3) ip4 := []byte(addr4) var probeTimeMax time.Duration m1 := HostMemberlist(addr1.String(), t, func(c *Config) { c.ProbeTimeout = 10 * time.Millisecond c.ProbeInterval = 200 * time.Millisecond probeTimeMax = c.ProbeInterval + 20*time.Millisecond }) defer m1.Shutdown() m2 := HostMemberlist(addr2.String(), t, nil) defer m2.Shutdown() m3 := HostMemberlist(addr3.String(), t, nil) defer m3.Shutdown() m4 := HostMemberlist(addr4.String(), t, nil) defer m4.Shutdown() a1 := alive{Node: addr1.String(), Addr: ip1, Port: 7946, Incarnation: 1} m1.aliveNode(&a1, nil, true) a2 := alive{Node: addr2.String(), Addr: ip2, Port: 7946, Incarnation: 1} m1.aliveNode(&a2, nil, false) a3 := alive{Node: addr3.String(), Addr: ip3, Port: 7946, Incarnation: 1} m1.aliveNode(&a3, nil, false) // Make sure m4 is configured with the same protocol version as m1 so // the TCP fallback behavior is enabled. a4 := alive{ Node: addr4.String(), Addr: ip4, Port: 7946, Incarnation: 1, Vsn: []uint8{ ProtocolVersionMin, ProtocolVersionMax, m1.config.ProtocolVersion, m1.config.DelegateProtocolMin, m1.config.DelegateProtocolMax, m1.config.DelegateProtocolVersion, }, } m1.aliveNode(&a4, nil, false) // Isolate m4 from UDP traffic by re-opening its listener on the wrong // port. This should force the TCP fallback path to be used. var err error if err = m4.udpListener.Close(); err != nil { t.Fatalf("err: %v", err) } udpAddr := &net.UDPAddr{IP: ip4, Port: 9999} if m4.udpListener, err = net.ListenUDP("udp", udpAddr); err != nil { t.Fatalf("err: %v", err) } // Disable the TCP pings using the config mechanism. m1.config.DisableTcpPings = true // Have node m1 probe m4. n := m1.nodeMap[addr4.String()] startProbe := time.Now() m1.probeNode(n) probeTime := time.Now().Sub(startProbe) // Node should be reported suspect. if n.State != stateSuspect { t.Fatalf("expect node to be suspect") } // Make sure TCP activity didn't cause us to wait too long before // timing out. if probeTime > probeTimeMax { t.Fatalf("took to long to probe, %9.6f", probeTime.Seconds()) } // Confirm at least one of the peers attempted an indirect probe. 
time.Sleep(probeTimeMax) if m2.sequenceNum != 1 && m3.sequenceNum != 1 { t.Fatalf("bad seqnos %v, %v", m2.sequenceNum, m3.sequenceNum) } } func TestMemberList_ProbeNode_FallbackTCP_OldProtocol(t *testing.T) { addr1 := getBindAddr() addr2 := getBindAddr() addr3 := getBindAddr() addr4 := getBindAddr() ip1 := []byte(addr1) ip2 := []byte(addr2) ip3 := []byte(addr3) ip4 := []byte(addr4) var probeTimeMax time.Duration m1 := HostMemberlist(addr1.String(), t, func(c *Config) { c.ProbeTimeout = 10 * time.Millisecond c.ProbeInterval = 200 * time.Millisecond probeTimeMax = c.ProbeInterval + 20*time.Millisecond }) defer m1.Shutdown() m2 := HostMemberlist(addr2.String(), t, nil) defer m2.Shutdown() m3 := HostMemberlist(addr3.String(), t, nil) defer m3.Shutdown() m4 := HostMemberlist(addr4.String(), t, nil) defer m4.Shutdown() a1 := alive{Node: addr1.String(), Addr: ip1, Port: 7946, Incarnation: 1} m1.aliveNode(&a1, nil, true) a2 := alive{Node: addr2.String(), Addr: ip2, Port: 7946, Incarnation: 1} m1.aliveNode(&a2, nil, false) a3 := alive{Node: addr3.String(), Addr: ip3, Port: 7946, Incarnation: 1} m1.aliveNode(&a3, nil, false) // Set up m4 so that it doesn't understand a version of the protocol // that supports TCP pings. a4 := alive{ Node: addr4.String(), Addr: ip4, Port: 7946, Incarnation: 1, Vsn: []uint8{ ProtocolVersionMin, ProtocolVersion2Compatible, ProtocolVersion2Compatible, m1.config.DelegateProtocolMin, m1.config.DelegateProtocolMax, m1.config.DelegateProtocolVersion, }, } m1.aliveNode(&a4, nil, false) // Isolate m4 from UDP traffic by re-opening its listener on the wrong // port. This should force the TCP fallback path to be used. var err error if err = m4.udpListener.Close(); err != nil { t.Fatalf("err: %v", err) } udpAddr := &net.UDPAddr{IP: ip4, Port: 9999} if m4.udpListener, err = net.ListenUDP("udp", udpAddr); err != nil { t.Fatalf("err: %v", err) } // Have node m1 probe m4. n := m1.nodeMap[addr4.String()] startProbe := time.Now() m1.probeNode(n) probeTime := time.Now().Sub(startProbe) // Node should be reported suspect. if n.State != stateSuspect { t.Fatalf("expect node to be suspect") } // Make sure TCP activity didn't cause us to wait too long before // timing out. if probeTime > probeTimeMax { t.Fatalf("took to long to probe, %9.6f", probeTime.Seconds()) } // Confirm at least one of the peers attempted an indirect probe. time.Sleep(probeTimeMax) if m2.sequenceNum != 1 && m3.sequenceNum != 1 { t.Fatalf("bad seqnos %v, %v", m2.sequenceNum, m3.sequenceNum) } } */ func TestMemberList_ProbeNode_Awareness_Degraded(t *testing.T) { addr1 := getBindAddr() addr2 := getBindAddr() addr3 := getBindAddr() addr4 := getBindAddr() ip1 := []byte(addr1) ip2 := []byte(addr2) ip3 := []byte(addr3) ip4 := []byte(addr4) var probeTimeMin time.Duration m1 := HostMemberlist(addr1.String(), t, func(c *Config) { c.ProbeTimeout = 10 * time.Millisecond c.ProbeInterval = 200 * time.Millisecond probeTimeMin = 2*c.ProbeInterval - 50*time.Millisecond }) defer m1.Shutdown() m2 := HostMemberlist(addr2.String(), t, func(c *Config) { c.ProbeTimeout = 10 * time.Millisecond c.ProbeInterval = 200 * time.Millisecond }) defer m2.Shutdown() m3 := HostMemberlist(addr3.String(), t, func(c *Config) { c.ProbeTimeout = 10 * time.Millisecond c.ProbeInterval = 200 * time.Millisecond }) defer m3.Shutdown() // This will enable nacks by invoking the latest protocol version. 
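	// The six Vsn entries are, in order: protocol min, protocol max,
	// protocol current, delegate min, delegate max, delegate current; the
	// same layout that aliveNode and verifyProtocol index into.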
vsn := []uint8{ ProtocolVersionMin, ProtocolVersionMax, m1.config.ProtocolVersion, m1.config.DelegateProtocolMin, m1.config.DelegateProtocolMax, m1.config.DelegateProtocolVersion, } a1 := alive{Node: addr1.String(), Addr: ip1, Port: 7946, Incarnation: 1, Vsn: vsn} m1.aliveNode(&a1, nil, true) a2 := alive{Node: addr2.String(), Addr: ip2, Port: 7946, Incarnation: 1, Vsn: vsn} m1.aliveNode(&a2, nil, false) a3 := alive{Node: addr3.String(), Addr: ip3, Port: 7946, Incarnation: 1, Vsn: vsn} m1.aliveNode(&a3, nil, false) // Node 4 never gets started. a4 := alive{Node: addr4.String(), Addr: ip4, Port: 7946, Incarnation: 1, Vsn: vsn} m1.aliveNode(&a4, nil, false) // Start the health in a degraded state. m1.awareness.ApplyDelta(1) if score := m1.GetHealthScore(); score != 1 { t.Fatalf("bad: %d", score) } // Have node m1 probe m4. n := m1.nodeMap[addr4.String()] startProbe := time.Now() m1.probeNode(n) probeTime := time.Now().Sub(startProbe) // Node should be reported suspect. if n.State != stateSuspect { t.Fatalf("expect node to be suspect") } // Make sure we timed out approximately on time (note that we accounted // for the slowed-down failure detector in the probeTimeMin calculation. if probeTime < probeTimeMin { t.Fatalf("probed too quickly, %9.6f", probeTime.Seconds()) } // Confirm at least one of the peers attempted an indirect probe. if m2.sequenceNum != 1 && m3.sequenceNum != 1 { t.Fatalf("bad seqnos %v, %v", m2.sequenceNum, m3.sequenceNum) } // We should have gotten all the nacks, so our score should remain the // same, since we didn't get a successful probe. if score := m1.GetHealthScore(); score != 1 { t.Fatalf("bad: %d", score) } } func TestMemberList_ProbeNode_Awareness_Improved(t *testing.T) { addr1 := getBindAddr() addr2 := getBindAddr() ip1 := []byte(addr1) ip2 := []byte(addr2) m1 := HostMemberlist(addr1.String(), t, func(c *Config) { c.ProbeTimeout = 10 * time.Millisecond c.ProbeInterval = 200 * time.Millisecond }) defer m1.Shutdown() m2 := HostMemberlist(addr2.String(), t, nil) defer m2.Shutdown() a1 := alive{Node: addr1.String(), Addr: ip1, Port: 7946, Incarnation: 1} m1.aliveNode(&a1, nil, true) a2 := alive{Node: addr2.String(), Addr: ip2, Port: 7946, Incarnation: 1} m1.aliveNode(&a2, nil, false) // Start the health in a degraded state. m1.awareness.ApplyDelta(1) if score := m1.GetHealthScore(); score != 1 { t.Fatalf("bad: %d", score) } // Have node m1 probe m2. n := m1.nodeMap[addr2.String()] m1.probeNode(n) // Node should be reported alive. if n.State != stateAlive { t.Fatalf("expect node to be suspect") } // Our score should have improved since we did a good probe. if score := m1.GetHealthScore(); score != 0 { t.Fatalf("bad: %d", score) } } func TestMemberList_ProbeNode_Awareness_MissedNack(t *testing.T) { addr1 := getBindAddr() addr2 := getBindAddr() addr3 := getBindAddr() addr4 := getBindAddr() ip1 := []byte(addr1) ip2 := []byte(addr2) ip3 := []byte(addr3) ip4 := []byte(addr4) var probeTimeMax time.Duration m1 := HostMemberlist(addr1.String(), t, func(c *Config) { c.ProbeTimeout = 10 * time.Millisecond c.ProbeInterval = 200 * time.Millisecond probeTimeMax = c.ProbeInterval + 50*time.Millisecond }) defer m1.Shutdown() m2 := HostMemberlist(addr2.String(), t, func(c *Config) { c.ProbeTimeout = 10 * time.Millisecond c.ProbeInterval = 200 * time.Millisecond }) defer m2.Shutdown() // This will enable nacks by invoking the latest protocol version. 
vsn := []uint8{ ProtocolVersionMin, ProtocolVersionMax, m1.config.ProtocolVersion, m1.config.DelegateProtocolMin, m1.config.DelegateProtocolMax, m1.config.DelegateProtocolVersion, } a1 := alive{Node: addr1.String(), Addr: ip1, Port: 7946, Incarnation: 1, Vsn: vsn} m1.aliveNode(&a1, nil, true) a2 := alive{Node: addr2.String(), Addr: ip2, Port: 7946, Incarnation: 1, Vsn: vsn} m1.aliveNode(&a2, nil, false) // Node 3 and node 4 never get started. a3 := alive{Node: addr3.String(), Addr: ip3, Port: 7946, Incarnation: 1, Vsn: vsn} m1.aliveNode(&a3, nil, false) a4 := alive{Node: addr4.String(), Addr: ip4, Port: 7946, Incarnation: 1, Vsn: vsn} m1.aliveNode(&a4, nil, false) // Make sure health looks good. if score := m1.GetHealthScore(); score != 0 { t.Fatalf("bad: %d", score) } // Have node m1 probe m4. n := m1.nodeMap[addr4.String()] startProbe := time.Now() m1.probeNode(n) probeTime := time.Now().Sub(startProbe) // Node should be reported suspect. if n.State != stateSuspect { t.Fatalf("expect node to be suspect") } // Make sure we timed out approximately on time. if probeTime > probeTimeMax { t.Fatalf("took to long to probe, %9.6f", probeTime.Seconds()) } // We should have gotten dinged for the missed nack. time.Sleep(probeTimeMax) if score := m1.GetHealthScore(); score != 2 { t.Fatalf("bad: %d", score) } } func TestMemberList_ProbeNode_Awareness_OldProtocol(t *testing.T) { addr1 := getBindAddr() addr2 := getBindAddr() addr3 := getBindAddr() addr4 := getBindAddr() ip1 := []byte(addr1) ip2 := []byte(addr2) ip3 := []byte(addr3) ip4 := []byte(addr4) var probeTimeMax time.Duration m1 := HostMemberlist(addr1.String(), t, func(c *Config) { c.ProbeTimeout = 10 * time.Millisecond c.ProbeInterval = 200 * time.Millisecond probeTimeMax = c.ProbeInterval + 20*time.Millisecond }) defer m1.Shutdown() m2 := HostMemberlist(addr2.String(), t, nil) defer m2.Shutdown() m3 := HostMemberlist(addr3.String(), t, nil) defer m3.Shutdown() a1 := alive{Node: addr1.String(), Addr: ip1, Port: 7946, Incarnation: 1} m1.aliveNode(&a1, nil, true) a2 := alive{Node: addr2.String(), Addr: ip2, Port: 7946, Incarnation: 1} m1.aliveNode(&a2, nil, false) a3 := alive{Node: addr3.String(), Addr: ip3, Port: 7946, Incarnation: 1} m1.aliveNode(&a3, nil, false) // Node 4 never gets started. a4 := alive{Node: addr4.String(), Addr: ip4, Port: 7946, Incarnation: 1} m1.aliveNode(&a4, nil, false) // Make sure health looks good. if score := m1.GetHealthScore(); score != 0 { t.Fatalf("bad: %d", score) } // Have node m1 probe m4. n := m1.nodeMap[addr4.String()] startProbe := time.Now() m1.probeNode(n) probeTime := time.Now().Sub(startProbe) // Node should be reported suspect. if n.State != stateSuspect { t.Fatalf("expect node to be suspect") } // Make sure we timed out approximately on time. if probeTime > probeTimeMax { t.Fatalf("took to long to probe, %9.6f", probeTime.Seconds()) } // Confirm at least one of the peers attempted an indirect probe. time.Sleep(probeTimeMax) if m2.sequenceNum != 1 && m3.sequenceNum != 1 { t.Fatalf("bad seqnos %v, %v", m2.sequenceNum, m3.sequenceNum) } // Since we are using the old protocol here, we should have gotten dinged // for a failed health check. 
if score := m1.GetHealthScore(); score != 1 { t.Fatalf("bad: %d", score) } } func TestMemberList_ProbeNode_Buddy(t *testing.T) { addr1 := getBindAddr() addr2 := getBindAddr() ip1 := []byte(addr1) ip2 := []byte(addr2) m1 := HostMemberlist(addr1.String(), t, func(c *Config) { c.ProbeTimeout = time.Millisecond c.ProbeInterval = 10 * time.Millisecond }) m2 := HostMemberlist(addr2.String(), t, nil) a1 := alive{Node: addr1.String(), Addr: ip1, Port: 7946, Incarnation: 1} a2 := alive{Node: addr2.String(), Addr: ip2, Port: 7946, Incarnation: 1} m1.aliveNode(&a1, nil, true) m1.aliveNode(&a2, nil, false) m2.aliveNode(&a2, nil, true) // Force the state to suspect so we piggyback a suspect message with the ping. // We should see this get refuted later, and the ping will succeed. n := m1.nodeMap[addr2.String()] n.State = stateSuspect m1.probeNode(n) // Make sure a ping was sent. if m1.sequenceNum != 1 { t.Fatalf("bad seqno %v", m1.sequenceNum) } // Check a broadcast is queued. if num := m2.broadcasts.NumQueued(); num != 1 { t.Fatalf("expected only one queued message: %d", num) } // Should be alive msg. if messageType(m2.broadcasts.bcQueue[0].b.Message()[0]) != aliveMsg { t.Fatalf("expected queued alive msg") } } func TestMemberList_ProbeNode(t *testing.T) { addr1 := getBindAddr() addr2 := getBindAddr() ip1 := []byte(addr1) ip2 := []byte(addr2) m1 := HostMemberlist(addr1.String(), t, func(c *Config) { c.ProbeTimeout = time.Millisecond c.ProbeInterval = 10 * time.Millisecond }) _ = HostMemberlist(addr2.String(), t, nil) a1 := alive{Node: addr1.String(), Addr: ip1, Port: 7946, Incarnation: 1} m1.aliveNode(&a1, nil, true) a2 := alive{Node: addr2.String(), Addr: ip2, Port: 7946, Incarnation: 1} m1.aliveNode(&a2, nil, false) n := m1.nodeMap[addr2.String()] m1.probeNode(n) // Should be marked alive if n.State != stateAlive { t.Fatalf("Expect node to be alive") } // Should increment seqno if m1.sequenceNum != 1 { t.Fatalf("bad seqno %v", m1.sequenceNum) } } func TestMemberList_Ping(t *testing.T) { addr1 := getBindAddr() addr2 := getBindAddr() ip1 := []byte(addr1) ip2 := []byte(addr2) m1 := HostMemberlist(addr1.String(), t, func(c *Config) { c.ProbeTimeout = time.Millisecond c.ProbeInterval = 10 * time.Second }) _ = HostMemberlist(addr2.String(), t, nil) a1 := alive{Node: addr1.String(), Addr: ip1, Port: 7946, Incarnation: 1} m1.aliveNode(&a1, nil, true) a2 := alive{Node: addr2.String(), Addr: ip2, Port: 7946, Incarnation: 1} m1.aliveNode(&a2, nil, false) // Do a legit ping. n := m1.nodeMap[addr2.String()] addr, err := net.ResolveUDPAddr("udp", net.JoinHostPort(addr2.String(), "7946")) if err != nil { t.Fatalf("err: %v", err) } rtt, err := m1.Ping(n.Name, addr) if err != nil { t.Fatalf("err: %v", err) } if !(rtt > 0) { t.Fatalf("bad: %v", rtt) } // This ping has a bad node name so should timeout. 
_, err = m1.Ping("bad", addr) if _, ok := err.(NoPingResponseError); !ok || err == nil { t.Fatalf("bad: %v", err) } } func TestMemberList_ResetNodes(t *testing.T) { m := GetMemberlist(t) a1 := alive{Node: "test1", Addr: []byte{127, 0, 0, 1}, Incarnation: 1} m.aliveNode(&a1, nil, false) a2 := alive{Node: "test2", Addr: []byte{127, 0, 0, 2}, Incarnation: 1} m.aliveNode(&a2, nil, false) a3 := alive{Node: "test3", Addr: []byte{127, 0, 0, 3}, Incarnation: 1} m.aliveNode(&a3, nil, false) d := dead{Node: "test2", Incarnation: 1} m.deadNode(&d) m.config.GossipToTheDeadTime = 100 * time.Millisecond m.resetNodes() if len(m.nodes) != 3 { t.Fatalf("Bad length") } if _, ok := m.nodeMap["test2"]; !ok { t.Fatalf("test2 should not be unmapped") } time.Sleep(200 * time.Millisecond) m.resetNodes() if len(m.nodes) != 2 { t.Fatalf("Bad length") } if _, ok := m.nodeMap["test2"]; ok { t.Fatalf("test2 should be unmapped") } } func TestMemberList_NextSeq(t *testing.T) { m := &Memberlist{} if m.nextSeqNo() != 1 { t.Fatalf("bad sequence no") } if m.nextSeqNo() != 2 { t.Fatalf("bad sequence no") } } func TestMemberList_setProbeChannels(t *testing.T) { m := &Memberlist{ackHandlers: make(map[uint32]*ackHandler)} ch := make(chan ackMessage, 1) m.setProbeChannels(0, ch, nil, 10*time.Millisecond) if _, ok := m.ackHandlers[0]; !ok { t.Fatalf("missing handler") } time.Sleep(20 * time.Millisecond) if _, ok := m.ackHandlers[0]; ok { t.Fatalf("non-reaped handler") } } func TestMemberList_setAckHandler(t *testing.T) { m := &Memberlist{ackHandlers: make(map[uint32]*ackHandler)} f := func([]byte, time.Time) {} m.setAckHandler(0, f, 10*time.Millisecond) if _, ok := m.ackHandlers[0]; !ok { t.Fatalf("missing handler") } time.Sleep(20 * time.Millisecond) if _, ok := m.ackHandlers[0]; ok { t.Fatalf("non-reaped handler") } } func TestMemberList_invokeAckHandler(t *testing.T) { m := &Memberlist{ackHandlers: make(map[uint32]*ackHandler)} // Does nothing m.invokeAckHandler(ackResp{}, time.Now()) var b bool f := func(payload []byte, timestamp time.Time) { b = true } m.setAckHandler(0, f, 10*time.Millisecond) // Should set b m.invokeAckHandler(ackResp{0, nil}, time.Now()) if !b { t.Fatalf("b not set") } if _, ok := m.ackHandlers[0]; ok { t.Fatalf("non-reaped handler") } } func TestMemberList_invokeAckHandler_Channel_Ack(t *testing.T) { m := &Memberlist{ackHandlers: make(map[uint32]*ackHandler)} ack := ackResp{0, []byte{0, 0, 0}} // Does nothing m.invokeAckHandler(ack, time.Now()) ackCh := make(chan ackMessage, 1) nackCh := make(chan struct{}, 1) m.setProbeChannels(0, ackCh, nackCh, 10*time.Millisecond) // Should send message m.invokeAckHandler(ack, time.Now()) select { case v := <-ackCh: if v.Complete != true { t.Fatalf("Bad value") } if bytes.Compare(v.Payload, ack.Payload) != 0 { t.Fatalf("wrong payload. expected: %v; actual: %v", ack.Payload, v.Payload) } case <-nackCh: t.Fatalf("should not get a nack") default: t.Fatalf("message not sent") } if _, ok := m.ackHandlers[0]; ok { t.Fatalf("non-reaped handler") } } func TestMemberList_invokeAckHandler_Channel_Nack(t *testing.T) { m := &Memberlist{ackHandlers: make(map[uint32]*ackHandler)} nack := nackResp{0} // Does nothing. m.invokeNackHandler(nack) ackCh := make(chan ackMessage, 1) nackCh := make(chan struct{}, 1) m.setProbeChannels(0, ackCh, nackCh, 10*time.Millisecond) // Should send message. m.invokeNackHandler(nack) select { case <-ackCh: t.Fatalf("should not get an ack") case <-nackCh: // Good. 
default: t.Fatalf("message not sent") } // Getting a nack doesn't reap the handler so that we can still forward // an ack up to the reap time, if we get one. if _, ok := m.ackHandlers[0]; !ok { t.Fatalf("handler should not be reaped") } ack := ackResp{0, []byte{0, 0, 0}} m.invokeAckHandler(ack, time.Now()) select { case v := <-ackCh: if v.Complete != true { t.Fatalf("Bad value") } if bytes.Compare(v.Payload, ack.Payload) != 0 { t.Fatalf("wrong payload. expected: %v; actual: %v", ack.Payload, v.Payload) } case <-nackCh: t.Fatalf("should not get a nack") default: t.Fatalf("message not sent") } if _, ok := m.ackHandlers[0]; ok { t.Fatalf("non-reaped handler") } } func TestMemberList_AliveNode_NewNode(t *testing.T) { ch := make(chan NodeEvent, 1) m := GetMemberlist(t) m.config.Events = &ChannelEventDelegate{ch} a := alive{Node: "test", Addr: []byte{127, 0, 0, 1}, Incarnation: 1} m.aliveNode(&a, nil, false) if len(m.nodes) != 1 { t.Fatalf("should add node") } state, ok := m.nodeMap["test"] if !ok { t.Fatalf("should map node") } if state.Incarnation != 1 { t.Fatalf("bad incarnation") } if state.State != stateAlive { t.Fatalf("bad state") } if time.Now().Sub(state.StateChange) > time.Second { t.Fatalf("bad change delta") } // Check for a join message select { case e := <-ch: if e.Node.Name != "test" { t.Fatalf("bad node name") } default: t.Fatalf("no join message") } // Check a broad cast is queued if m.broadcasts.NumQueued() != 1 { t.Fatalf("expected queued message") } } func TestMemberList_AliveNode_SuspectNode(t *testing.T) { ch := make(chan NodeEvent, 1) m := GetMemberlist(t) a := alive{Node: "test", Addr: []byte{127, 0, 0, 1}, Incarnation: 1} m.aliveNode(&a, nil, false) // Listen only after first join m.config.Events = &ChannelEventDelegate{ch} // Make suspect state := m.nodeMap["test"] state.State = stateSuspect state.StateChange = state.StateChange.Add(-time.Hour) // Old incarnation number, should not change m.aliveNode(&a, nil, false) if state.State != stateSuspect { t.Fatalf("update with old incarnation!") } // Should reset to alive now a.Incarnation = 2 m.aliveNode(&a, nil, false) if state.State != stateAlive { t.Fatalf("no update with new incarnation!") } if time.Now().Sub(state.StateChange) > time.Second { t.Fatalf("bad change delta") } // Check for a no join message select { case <-ch: t.Fatalf("got bad join message") default: } // Check a broad cast is queued if m.broadcasts.NumQueued() != 1 { t.Fatalf("expected queued message") } } func TestMemberList_AliveNode_Idempotent(t *testing.T) { ch := make(chan NodeEvent, 1) m := GetMemberlist(t) a := alive{Node: "test", Addr: []byte{127, 0, 0, 1}, Incarnation: 1} m.aliveNode(&a, nil, false) // Listen only after first join m.config.Events = &ChannelEventDelegate{ch} // Make suspect state := m.nodeMap["test"] stateTime := state.StateChange // Should reset to alive now a.Incarnation = 2 m.aliveNode(&a, nil, false) if state.State != stateAlive { t.Fatalf("non idempotent") } if stateTime != state.StateChange { t.Fatalf("should not change state") } // Check for a no join message select { case <-ch: t.Fatalf("got bad join message") default: } // Check a broad cast is queued if m.broadcasts.NumQueued() != 1 { t.Fatalf("expected only one queued message") } } // Serf Bug: GH-58, Meta data does not update func TestMemberList_AliveNode_ChangeMeta(t *testing.T) { ch := make(chan NodeEvent, 1) m := GetMemberlist(t) a := alive{ Node: "test", Addr: []byte{127, 0, 0, 1}, Meta: []byte("val1"), Incarnation: 1} m.aliveNode(&a, nil, false) // Listen only 
after first join m.config.Events = &ChannelEventDelegate{ch} // Make suspect state := m.nodeMap["test"] // Should reset to alive now a.Incarnation = 2 a.Meta = []byte("val2") m.aliveNode(&a, nil, false) // Check updates if bytes.Compare(state.Meta, a.Meta) != 0 { t.Fatalf("meta did not update") } // Check for a NotifyUpdate select { case e := <-ch: if e.Event != NodeUpdate { t.Fatalf("bad event: %v", e) } if e.Node != &state.Node { t.Fatalf("bad event: %v", e) } if bytes.Compare(e.Node.Meta, a.Meta) != 0 { t.Fatalf("meta did not update") } default: t.Fatalf("missing event!") } } func TestMemberList_AliveNode_Refute(t *testing.T) { m := GetMemberlist(t) a := alive{Node: m.config.Name, Addr: []byte{127, 0, 0, 1}, Incarnation: 1} m.aliveNode(&a, nil, true) // Clear queue m.broadcasts.Reset() // Conflicting alive s := alive{ Node: m.config.Name, Addr: []byte{127, 0, 0, 1}, Incarnation: 2, Meta: []byte("foo"), } m.aliveNode(&s, nil, false) state := m.nodeMap[m.config.Name] if state.State != stateAlive { t.Fatalf("should still be alive") } if state.Meta != nil { t.Fatalf("meta should still be nil") } // Check a broad cast is queued if num := m.broadcasts.NumQueued(); num != 1 { t.Fatalf("expected only one queued message: %d", num) } // Should be alive mesg if messageType(m.broadcasts.bcQueue[0].b.Message()[0]) != aliveMsg { t.Fatalf("expected queued alive msg") } } func TestMemberList_SuspectNode_NoNode(t *testing.T) { m := GetMemberlist(t) s := suspect{Node: "test", Incarnation: 1} m.suspectNode(&s) if len(m.nodes) != 0 { t.Fatalf("don't expect nodes") } } func TestMemberList_SuspectNode(t *testing.T) { m := GetMemberlist(t) m.config.ProbeInterval = time.Millisecond m.config.SuspicionMult = 1 a := alive{Node: "test", Addr: []byte{127, 0, 0, 1}, Incarnation: 1} m.aliveNode(&a, nil, false) state := m.nodeMap["test"] state.StateChange = state.StateChange.Add(-time.Hour) s := suspect{Node: "test", Incarnation: 1} m.suspectNode(&s) if state.State != stateSuspect { t.Fatalf("Bad state") } change := state.StateChange if time.Now().Sub(change) > time.Second { t.Fatalf("bad change delta") } // Check a broad cast is queued if m.broadcasts.NumQueued() != 1 { t.Fatalf("expected only one queued message") } // Check its a suspect message if messageType(m.broadcasts.bcQueue[0].b.Message()[0]) != suspectMsg { t.Fatalf("expected queued suspect msg") } // Wait for the timeout time.Sleep(10 * time.Millisecond) if state.State != stateDead { t.Fatalf("Bad state") } if time.Now().Sub(state.StateChange) > time.Second { t.Fatalf("bad change delta") } if !state.StateChange.After(change) { t.Fatalf("should increment time") } // Check a broad cast is queued if m.broadcasts.NumQueued() != 1 { t.Fatalf("expected only one queued message") } // Check its a suspect message if messageType(m.broadcasts.bcQueue[0].b.Message()[0]) != deadMsg { t.Fatalf("expected queued dead msg") } } func TestMemberList_SuspectNode_DoubleSuspect(t *testing.T) { m := GetMemberlist(t) a := alive{Node: "test", Addr: []byte{127, 0, 0, 1}, Incarnation: 1} m.aliveNode(&a, nil, false) state := m.nodeMap["test"] state.StateChange = state.StateChange.Add(-time.Hour) s := suspect{Node: "test", Incarnation: 1} m.suspectNode(&s) if state.State != stateSuspect { t.Fatalf("Bad state") } change := state.StateChange if time.Now().Sub(change) > time.Second { t.Fatalf("bad change delta") } // clear the broadcast queue m.broadcasts.Reset() // Suspect again m.suspectNode(&s) if state.StateChange != change { t.Fatalf("unexpected state change") } // Check a broad cast 
is queued if m.broadcasts.NumQueued() != 0 { t.Fatalf("expected only one queued message") } } func TestMemberList_SuspectNode_OldSuspect(t *testing.T) { m := GetMemberlist(t) a := alive{Node: "test", Addr: []byte{127, 0, 0, 1}, Incarnation: 10} m.aliveNode(&a, nil, false) state := m.nodeMap["test"] state.StateChange = state.StateChange.Add(-time.Hour) // Clear queue m.broadcasts.Reset() s := suspect{Node: "test", Incarnation: 1} m.suspectNode(&s) if state.State != stateAlive { t.Fatalf("Bad state") } // Check a broad cast is queued if m.broadcasts.NumQueued() != 0 { t.Fatalf("expected only one queued message") } } func TestMemberList_SuspectNode_Refute(t *testing.T) { m := GetMemberlist(t) a := alive{Node: m.config.Name, Addr: []byte{127, 0, 0, 1}, Incarnation: 1} m.aliveNode(&a, nil, true) // Clear queue m.broadcasts.Reset() // Make sure health is in a good state if score := m.GetHealthScore(); score != 0 { t.Fatalf("bad: %d", score) } s := suspect{Node: m.config.Name, Incarnation: 1} m.suspectNode(&s) state := m.nodeMap[m.config.Name] if state.State != stateAlive { t.Fatalf("should still be alive") } // Check a broad cast is queued if m.broadcasts.NumQueued() != 1 { t.Fatalf("expected only one queued message") } // Should be alive mesg if messageType(m.broadcasts.bcQueue[0].b.Message()[0]) != aliveMsg { t.Fatalf("expected queued alive msg") } // Health should have been dinged if score := m.GetHealthScore(); score != 1 { t.Fatalf("bad: %d", score) } } func TestMemberList_DeadNode_NoNode(t *testing.T) { m := GetMemberlist(t) d := dead{Node: "test", Incarnation: 1} m.deadNode(&d) if len(m.nodes) != 0 { t.Fatalf("don't expect nodes") } } func TestMemberList_DeadNode(t *testing.T) { ch := make(chan NodeEvent, 1) m := GetMemberlist(t) m.config.Events = &ChannelEventDelegate{ch} a := alive{Node: "test", Addr: []byte{127, 0, 0, 1}, Incarnation: 1} m.aliveNode(&a, nil, false) // Read the join event <-ch state := m.nodeMap["test"] state.StateChange = state.StateChange.Add(-time.Hour) d := dead{Node: "test", Incarnation: 1} m.deadNode(&d) if state.State != stateDead { t.Fatalf("Bad state") } change := state.StateChange if time.Now().Sub(change) > time.Second { t.Fatalf("bad change delta") } select { case leave := <-ch: if leave.Event != NodeLeave || leave.Node.Name != "test" { t.Fatalf("bad node name") } default: t.Fatalf("no leave message") } // Check a broad cast is queued if m.broadcasts.NumQueued() != 1 { t.Fatalf("expected only one queued message") } // Check its a suspect message if messageType(m.broadcasts.bcQueue[0].b.Message()[0]) != deadMsg { t.Fatalf("expected queued dead msg") } } func TestMemberList_DeadNode_Double(t *testing.T) { ch := make(chan NodeEvent, 1) m := GetMemberlist(t) a := alive{Node: "test", Addr: []byte{127, 0, 0, 1}, Incarnation: 1} m.aliveNode(&a, nil, false) state := m.nodeMap["test"] state.StateChange = state.StateChange.Add(-time.Hour) d := dead{Node: "test", Incarnation: 1} m.deadNode(&d) // Clear queue m.broadcasts.Reset() // Notify after the first dead m.config.Events = &ChannelEventDelegate{ch} // Should do nothing d.Incarnation = 2 m.deadNode(&d) select { case <-ch: t.Fatalf("should not get leave") default: } // Check a broad cast is queued if m.broadcasts.NumQueued() != 0 { t.Fatalf("expected only one queued message") } } func TestMemberList_DeadNode_OldDead(t *testing.T) { m := GetMemberlist(t) a := alive{Node: "test", Addr: []byte{127, 0, 0, 1}, Incarnation: 10} m.aliveNode(&a, nil, false) state := m.nodeMap["test"] state.StateChange = 
state.StateChange.Add(-time.Hour) d := dead{Node: "test", Incarnation: 1} m.deadNode(&d) if state.State != stateAlive { t.Fatalf("Bad state") } } func TestMemberList_DeadNode_AliveReplay(t *testing.T) { m := GetMemberlist(t) a := alive{Node: "test", Addr: []byte{127, 0, 0, 1}, Incarnation: 10} m.aliveNode(&a, nil, false) d := dead{Node: "test", Incarnation: 10} m.deadNode(&d) // Replay alive at same incarnation m.aliveNode(&a, nil, false) // Should remain dead state, ok := m.nodeMap["test"] if ok && state.State != stateDead { t.Fatalf("Bad state") } } func TestMemberList_DeadNode_Refute(t *testing.T) { m := GetMemberlist(t) a := alive{Node: m.config.Name, Addr: []byte{127, 0, 0, 1}, Incarnation: 1} m.aliveNode(&a, nil, true) // Clear queue m.broadcasts.Reset() // Make sure health is in a good state if score := m.GetHealthScore(); score != 0 { t.Fatalf("bad: %d", score) } d := dead{Node: m.config.Name, Incarnation: 1} m.deadNode(&d) state := m.nodeMap[m.config.Name] if state.State != stateAlive { t.Fatalf("should still be alive") } // Check a broad cast is queued if m.broadcasts.NumQueued() != 1 { t.Fatalf("expected only one queued message") } // Should be alive mesg if messageType(m.broadcasts.bcQueue[0].b.Message()[0]) != aliveMsg { t.Fatalf("expected queued alive msg") } // We should have been dinged if score := m.GetHealthScore(); score != 1 { t.Fatalf("bad: %d", score) } } func TestMemberList_MergeState(t *testing.T) { m := GetMemberlist(t) a1 := alive{Node: "test1", Addr: []byte{127, 0, 0, 1}, Incarnation: 1} m.aliveNode(&a1, nil, false) a2 := alive{Node: "test2", Addr: []byte{127, 0, 0, 2}, Incarnation: 1} m.aliveNode(&a2, nil, false) a3 := alive{Node: "test3", Addr: []byte{127, 0, 0, 3}, Incarnation: 1} m.aliveNode(&a3, nil, false) s := suspect{Node: "test1", Incarnation: 1} m.suspectNode(&s) remote := []pushNodeState{ pushNodeState{ Name: "test1", Addr: []byte{127, 0, 0, 1}, Incarnation: 2, State: stateAlive, }, pushNodeState{ Name: "test2", Addr: []byte{127, 0, 0, 2}, Incarnation: 1, State: stateSuspect, }, pushNodeState{ Name: "test3", Addr: []byte{127, 0, 0, 3}, Incarnation: 1, State: stateDead, }, pushNodeState{ Name: "test4", Addr: []byte{127, 0, 0, 4}, Incarnation: 2, State: stateAlive, }, } // Listen for changes eventCh := make(chan NodeEvent, 1) m.config.Events = &ChannelEventDelegate{eventCh} // Merge remote state m.mergeState(remote) // Check the states state := m.nodeMap["test1"] if state.State != stateAlive || state.Incarnation != 2 { t.Fatalf("Bad state %v", state) } state = m.nodeMap["test2"] if state.State != stateSuspect || state.Incarnation != 1 { t.Fatalf("Bad state %v", state) } state = m.nodeMap["test3"] if state.State != stateSuspect { t.Fatalf("Bad state %v", state) } state = m.nodeMap["test4"] if state.State != stateAlive || state.Incarnation != 2 { t.Fatalf("Bad state %v", state) } // Check the channels select { case e := <-eventCh: if e.Event != NodeJoin || e.Node.Name != "test4" { t.Fatalf("bad node %v", e) } default: t.Fatalf("Expect join") } select { case e := <-eventCh: t.Fatalf("Unexpect event: %v", e) default: } } func TestMemberlist_Gossip(t *testing.T) { ch := make(chan NodeEvent, 3) addr1 := getBindAddr() addr2 := getBindAddr() ip1 := []byte(addr1) ip2 := []byte(addr2) m1 := HostMemberlist(addr1.String(), t, func(c *Config) { c.GossipInterval = time.Millisecond }) m2 := HostMemberlist(addr2.String(), t, func(c *Config) { c.Events = &ChannelEventDelegate{ch} c.GossipInterval = time.Millisecond }) defer m1.Shutdown() defer m2.Shutdown() a1 := 
alive{Node: addr1.String(), Addr: ip1, Port: 7946, Incarnation: 1} m1.aliveNode(&a1, nil, true) a2 := alive{Node: addr2.String(), Addr: ip2, Port: 7946, Incarnation: 1} m1.aliveNode(&a2, nil, false) a3 := alive{Node: "172.0.0.1", Addr: []byte{172, 0, 0, 1}, Incarnation: 1} m1.aliveNode(&a3, nil, false) // Gossip should send all this to m2 m1.gossip() for i := 0; i < 3; i++ { select { case <-ch: case <-time.After(50 * time.Millisecond): t.Fatalf("timeout") } } } func TestMemberlist_GossipToDead(t *testing.T) { ch := make(chan NodeEvent, 2) addr1 := getBindAddr() addr2 := getBindAddr() ip1 := []byte(addr1) ip2 := []byte(addr2) m1 := HostMemberlist(addr1.String(), t, func(c *Config) { c.GossipInterval = time.Millisecond c.GossipToTheDeadTime = 100 * time.Millisecond }) m2 := HostMemberlist(addr2.String(), t, func(c *Config) { c.Events = &ChannelEventDelegate{ch} }) defer m1.Shutdown() defer m2.Shutdown() a1 := alive{Node: addr1.String(), Addr: ip1, Port: 7946, Incarnation: 1} m1.aliveNode(&a1, nil, true) a2 := alive{Node: addr2.String(), Addr: ip2, Port: 7946, Incarnation: 1} m1.aliveNode(&a2, nil, false) // Shouldn't send anything to m2 here, node has been dead for 2x the GossipToTheDeadTime m1.nodeMap[addr2.String()].State = stateDead m1.nodeMap[addr2.String()].StateChange = time.Now().Add(-200 * time.Millisecond) m1.gossip() select { case <-ch: t.Fatalf("shouldn't get gossip") case <-time.After(50 * time.Millisecond): } // Should gossip to m2 because its state has changed within GossipToTheDeadTime m1.nodeMap[addr2.String()].StateChange = time.Now().Add(-20 * time.Millisecond) m1.gossip() for i := 0; i < 2; i++ { select { case <-ch: case <-time.After(50 * time.Millisecond): t.Fatalf("timeout") } } } func TestMemberlist_PushPull(t *testing.T) { addr1 := getBindAddr() addr2 := getBindAddr() ip1 := []byte(addr1) ip2 := []byte(addr2) ch := make(chan NodeEvent, 3) m1 := HostMemberlist(addr1.String(), t, func(c *Config) { c.GossipInterval = 10 * time.Second c.PushPullInterval = time.Millisecond }) m2 := HostMemberlist(addr2.String(), t, func(c *Config) { c.GossipInterval = 10 * time.Second c.Events = &ChannelEventDelegate{ch} }) defer m1.Shutdown() defer m2.Shutdown() a1 := alive{Node: addr1.String(), Addr: ip1, Port: 7946, Incarnation: 1} m1.aliveNode(&a1, nil, true) a2 := alive{Node: addr2.String(), Addr: ip2, Port: 7946, Incarnation: 1} m1.aliveNode(&a2, nil, false) // Gossip should send all this to m2 m1.pushPull() for i := 0; i < 2; i++ { select { case <-ch: case <-time.After(10 * time.Millisecond): t.Fatalf("timeout") } } } func TestVerifyProtocol(t *testing.T) { cases := []struct { Anodes [][3]uint8 Bnodes [][3]uint8 expected bool }{ // Both running identical everything { Anodes: [][3]uint8{ {0, 0, 0}, }, Bnodes: [][3]uint8{ {0, 0, 0}, }, expected: true, }, // One can understand newer, but speaking same protocol { Anodes: [][3]uint8{ {0, 0, 0}, }, Bnodes: [][3]uint8{ {0, 1, 0}, }, expected: true, }, // One is speaking outside the range { Anodes: [][3]uint8{ {0, 0, 0}, }, Bnodes: [][3]uint8{ {1, 1, 1}, }, expected: false, }, // Transitively outside the range { Anodes: [][3]uint8{ {0, 1, 0}, {0, 2, 1}, }, Bnodes: [][3]uint8{ {1, 3, 1}, }, expected: false, }, // Multi-node { Anodes: [][3]uint8{ {0, 3, 2}, {0, 2, 0}, }, Bnodes: [][3]uint8{ {0, 2, 1}, {0, 5, 0}, }, expected: true, }, } for _, tc := range cases { aCore := make([][6]uint8, len(tc.Anodes)) aApp := make([][6]uint8, len(tc.Anodes)) for i, n := range tc.Anodes { aCore[i] = [6]uint8{n[0], n[1], n[2], 0, 0, 0} aApp[i] = [6]uint8{0, 0, 
0, n[0], n[1], n[2]} } bCore := make([][6]uint8, len(tc.Bnodes)) bApp := make([][6]uint8, len(tc.Bnodes)) for i, n := range tc.Bnodes { bCore[i] = [6]uint8{n[0], n[1], n[2], 0, 0, 0} bApp[i] = [6]uint8{0, 0, 0, n[0], n[1], n[2]} } // Test core protocol verification testVerifyProtocolSingle(t, aCore, bCore, tc.expected) testVerifyProtocolSingle(t, bCore, aCore, tc.expected) // Test app protocol verification testVerifyProtocolSingle(t, aApp, bApp, tc.expected) testVerifyProtocolSingle(t, bApp, aApp, tc.expected) } } func testVerifyProtocolSingle(t *testing.T, A [][6]uint8, B [][6]uint8, expect bool) { m := GetMemberlist(t) defer m.Shutdown() m.nodes = make([]*nodeState, len(A)) for i, n := range A { m.nodes[i] = &nodeState{ Node: Node{ PMin: n[0], PMax: n[1], PCur: n[2], DMin: n[3], DMax: n[4], DCur: n[5], }, } } remote := make([]pushNodeState, len(B)) for i, n := range B { remote[i] = pushNodeState{ Name: fmt.Sprintf("node %d", i), Vsn: []uint8{n[0], n[1], n[2], n[3], n[4], n[5]}, } } err := m.verifyProtocol(remote) if (err == nil) != expect { t.Fatalf("bad:\nA: %v\nB: %v\nErr: %s", A, B, err) } } memberlist-0.1.0/suspicion.go000066400000000000000000000103161307374264600162330ustar00rootroot00000000000000package memberlist import ( "math" "sync/atomic" "time" ) // suspicion manages the suspect timer for a node and provides an interface // to accelerate the timeout as we get more independent confirmations that // a node is suspect. type suspicion struct { // n is the number of independent confirmations we've seen. This must // be updated using atomic instructions to prevent contention with the // timer callback. n int32 // k is the number of independent confirmations we'd like to see in // order to drive the timer to its minimum value. k int32 // min is the minimum timer value. min time.Duration // max is the maximum timer value. max time.Duration // start captures the timestamp when we began the timer. This is used // so we can calculate durations to feed the timer during updates in // a way the achieves the overall time we'd like. start time.Time // timer is the underlying timer that implements the timeout. timer *time.Timer // f is the function to call when the timer expires. We hold on to this // because there are cases where we call it directly. timeoutFn func() // confirmations is a map of "from" nodes that have confirmed a given // node is suspect. This prevents double counting. confirmations map[string]struct{} } // newSuspicion returns a timer started with the max time, and that will drive // to the min time after seeing k or more confirmations. The from node will be // excluded from confirmations since we might get our own suspicion message // gossiped back to us. The minimum time will be used if no confirmations are // called for (k <= 0). func newSuspicion(from string, k int, min time.Duration, max time.Duration, fn func(int)) *suspicion { s := &suspicion{ k: int32(k), min: min, max: max, confirmations: make(map[string]struct{}), } // Exclude the from node from any confirmations. s.confirmations[from] = struct{}{} // Pass the number of confirmations into the timeout function for // easy telemetry. s.timeoutFn = func() { fn(int(atomic.LoadInt32(&s.n))) } // If there aren't any confirmations to be made then take the min // time from the start. 
	timeout := max
	if k < 1 {
		timeout = min
	}
	s.timer = time.AfterFunc(timeout, s.timeoutFn)

	// Capture the start time right after starting the timer above so
	// we should always err on the side of a little longer timeout if
	// there's any preemption that separates this and the step above.
	s.start = time.Now()
	return s
}

// remainingSuspicionTime takes the state variables of the suspicion timer and
// calculates the remaining time to wait before considering a node dead. The
// return value can be negative, so be prepared to fire the timer immediately in
// that case.
func remainingSuspicionTime(n, k int32, elapsed time.Duration, min, max time.Duration) time.Duration {
	frac := math.Log(float64(n)+1.0) / math.Log(float64(k)+1.0)
	raw := max.Seconds() - frac*(max.Seconds()-min.Seconds())
	timeout := time.Duration(math.Floor(1000.0*raw)) * time.Millisecond
	if timeout < min {
		timeout = min
	}

	// We have to take into account the amount of time that has passed so
	// far, so we get the right overall timeout.
	return timeout - elapsed
}

// Confirm registers that a possibly new peer has also determined the given
// node is suspect. This returns true if this was new information, and false
// if it was a duplicate confirmation, or if we've got enough confirmations to
// hit the minimum.
func (s *suspicion) Confirm(from string) bool {
	// If we've got enough confirmations then stop accepting them.
	if atomic.LoadInt32(&s.n) >= s.k {
		return false
	}

	// Only allow one confirmation from each possible peer.
	if _, ok := s.confirmations[from]; ok {
		return false
	}
	s.confirmations[from] = struct{}{}

	// Compute the new timeout given the current number of confirmations and
	// adjust the timer. If the timeout becomes negative *and* we can cleanly
	// stop the timer then we will call the timeout function directly from
	// here.
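	//
	// For a rough sense of the acceleration (the numbers match the table in
	// suspicion_test.go): with k=3, min=2s, max=30s, a first confirmation
	// arriving 2s in leaves about 14s on the timer, and the third
	// confirmation pulls the overall timeout down to the 2s minimum, firing
	// immediately if more than that has already elapsed.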
n := atomic.AddInt32(&s.n, 1) elapsed := time.Now().Sub(s.start) remaining := remainingSuspicionTime(n, s.k, elapsed, s.min, s.max) if s.timer.Stop() { if remaining > 0 { s.timer.Reset(remaining) } else { go s.timeoutFn() } } return true } memberlist-0.1.0/suspicion_test.go000066400000000000000000000106211307374264600172710ustar00rootroot00000000000000package memberlist import ( "testing" "time" ) func TestSuspicion_remainingSuspicionTime(t *testing.T) { cases := []struct { n int32 k int32 elapsed time.Duration min time.Duration max time.Duration expected time.Duration }{ {0, 3, 0, 2 * time.Second, 30 * time.Second, 30 * time.Second}, {1, 3, 2 * time.Second, 2 * time.Second, 30 * time.Second, 14 * time.Second}, {2, 3, 3 * time.Second, 2 * time.Second, 30 * time.Second, 4810 * time.Millisecond}, {3, 3, 4 * time.Second, 2 * time.Second, 30 * time.Second, -2 * time.Second}, {4, 3, 5 * time.Second, 2 * time.Second, 30 * time.Second, -3 * time.Second}, {5, 3, 10 * time.Second, 2 * time.Second, 30 * time.Second, -8 * time.Second}, } for i, c := range cases { remaining := remainingSuspicionTime(c.n, c.k, c.elapsed, c.min, c.max) if remaining != c.expected { t.Errorf("case %d: remaining %9.6f != expected %9.6f", i, remaining.Seconds(), c.expected.Seconds()) } } } func TestSuspicion_Timer(t *testing.T) { const k = 3 const min = 500 * time.Millisecond const max = 2 * time.Second type pair struct { from string newInfo bool } cases := []struct { numConfirmations int from string confirmations []pair expected time.Duration }{ { 0, "me", []pair{}, max, }, { 1, "me", []pair{ pair{"me", false}, pair{"foo", true}, }, 1250 * time.Millisecond, }, { 1, "me", []pair{ pair{"me", false}, pair{"foo", true}, pair{"foo", false}, pair{"foo", false}, }, 1250 * time.Millisecond, }, { 2, "me", []pair{ pair{"me", false}, pair{"foo", true}, pair{"bar", true}, }, 810 * time.Millisecond, }, { 3, "me", []pair{ pair{"me", false}, pair{"foo", true}, pair{"bar", true}, pair{"baz", true}, }, min, }, { 3, "me", []pair{ pair{"me", false}, pair{"foo", true}, pair{"bar", true}, pair{"baz", true}, pair{"zoo", false}, }, min, }, } for i, c := range cases { ch := make(chan time.Duration, 1) start := time.Now() f := func(numConfirmations int) { if numConfirmations != c.numConfirmations { t.Errorf("case %d: bad %d != %d", i, numConfirmations, c.numConfirmations) } ch <- time.Now().Sub(start) } // Create the timer and add the requested confirmations. Wait // the fudge amount to help make sure we calculate the timeout // overall, and don't accumulate extra time. s := newSuspicion(c.from, k, min, max, f) fudge := 25 * time.Millisecond for _, p := range c.confirmations { time.Sleep(fudge) if s.Confirm(p.from) != p.newInfo { t.Fatalf("case %d: newInfo mismatch for %s", i, p.from) } } // Wait until right before the timeout and make sure the // timer hasn't fired. already := time.Duration(len(c.confirmations)) * fudge time.Sleep(c.expected - already - fudge) select { case d := <-ch: t.Fatalf("case %d: should not have fired (%9.6f)", i, d.Seconds()) default: } // Wait through the timeout and a little after and make sure it // fires. time.Sleep(2 * fudge) select { case <-ch: default: t.Fatalf("case %d: should have fired", i) } // Confirm after to make sure it handles a negative remaining // time correctly and doesn't fire again. 
s.Confirm("late") time.Sleep(c.expected + 2*fudge) select { case d := <-ch: t.Fatalf("case %d: should not have fired (%9.6f)", i, d.Seconds()) default: } } } func TestSuspicion_Timer_ZeroK(t *testing.T) { ch := make(chan struct{}, 1) f := func(int) { ch <- struct{}{} } // This should select the min time since there are no expected // confirmations to accelerate the timer. s := newSuspicion("me", 0, 25*time.Millisecond, 30*time.Second, f) if s.Confirm("foo") { t.Fatalf("should not provide new information") } select { case <-ch: case <-time.After(50 * time.Millisecond): t.Fatalf("should have fired") } } func TestSuspicion_Timer_Immediate(t *testing.T) { ch := make(chan struct{}, 1) f := func(int) { ch <- struct{}{} } // This should underflow the timeout and fire immediately. s := newSuspicion("me", 1, 100*time.Millisecond, 30*time.Second, f) time.Sleep(200 * time.Millisecond) s.Confirm("foo") // Wait a little while since the function gets called in a goroutine. select { case <-ch: case <-time.After(25 * time.Millisecond): t.Fatalf("should have fired") } } memberlist-0.1.0/tag.sh000077500000000000000000000006171307374264600150050ustar00rootroot00000000000000#!/usr/bin/env bash set -e # The version must be supplied from the environment. Do not include the # leading "v". if [ -z $VERSION ]; then echo "Please specify a version." exit 1 fi # Generate the tag. echo "==> Tagging version $VERSION..." git commit --allow-empty -a --gpg-sign=348FFC4C -m "Release v$VERSION" git tag -a -m "Version $VERSION" -s -u 348FFC4C "v${VERSION}" master exit 0 memberlist-0.1.0/test/000077500000000000000000000000001307374264600146465ustar00rootroot00000000000000memberlist-0.1.0/test/setup_subnet.sh000077500000000000000000000007771307374264600177400ustar00rootroot00000000000000#!/bin/bash # # This script makes sure that 127.0.0.x is routable. On Darwin, there # is a bug that it isn't routable and this causes errors. # # Check if loopback is setup ping -c 1 -W 10 127.0.0.2 > /dev/null 2>&1 if [ $? -eq 0 ] then exit fi # If we're not on OS X, then error case $OSTYPE in darwin*) ;; *) echo "Can't setup interfaces on non-Mac. Error!" exit 1 ;; esac # Setup loopback for ((i=2;i<256;i++)) do sudo ifconfig lo0 alias 127.0.0.$i up done memberlist-0.1.0/todo.md000066400000000000000000000003231307374264600151540ustar00rootroot00000000000000# TODO * Dynamic RTT discovery * Compute 99th percentile for ping/ack * Better lower bound for ping/ack, faster failure detection * Dynamic MTU discovery * Prevent lost updates, increases efficiency memberlist-0.1.0/transport.go000066400000000000000000000052171307374264600162570ustar00rootroot00000000000000package memberlist import ( "net" "time" ) // Packet is used to provide some metadata about incoming packets from peers // over a packet connection, as well as the packet payload. type Packet struct { // Buf has the raw contents of the packet. Buf []byte // From has the address of the peer. This is an actual net.Addr so we // can expose some concrete details about incoming packets. From net.Addr // Timestamp is the time when the packet was received. This should be // taken as close as possible to the actual receipt time to help make an // accurate RTT measurements during probes. Timestamp time.Time } // Transport is used to abstract over communicating with other peers. The packet // interface is assumed to be best-effort and the stream interface is assumed to // be reliable. 
type Transport interface {
	// FinalAdvertiseAddr is given the user's configured values (which
	// might be empty) and returns the desired IP and port to advertise to
	// the rest of the cluster.
	FinalAdvertiseAddr(ip string, port int) (net.IP, int, error)

	// WriteTo is a packet-oriented interface that fires off the given
	// payload to the given address in a connectionless fashion. This should
	// return a time stamp that's as close as possible to when the packet
	// was transmitted to help make accurate RTT measurements during probes.
	//
	// This is similar to net.PacketConn, though we didn't want to expose
	// that full set of required methods to keep assumptions about the
	// underlying plumbing to a minimum. We also treat the address here as a
	// string, similar to Dial, so it's network neutral, so this usually is
	// in the form of "host:port".
	WriteTo(b []byte, addr string) (time.Time, error)

	// PacketCh returns a channel that can be read to receive incoming
	// packets from other peers. How this is set up for listening is left as
	// an exercise for the concrete transport implementations.
	PacketCh() <-chan *Packet

	// DialTimeout is used to create a connection that allows us to perform
	// two-way communication with a peer. This is generally more expensive
	// than packet connections so is used for more infrequent operations
	// such as anti-entropy or fallback probes if the packet-oriented probe
	// failed.
	DialTimeout(addr string, timeout time.Duration) (net.Conn, error)

	// StreamCh returns a channel that can be read to handle incoming stream
	// connections from other peers. How this is set up for listening is
	// left as an exercise for the concrete transport implementations.
	StreamCh() <-chan net.Conn

	// Shutdown is called when memberlist is shutting down; this gives the
	// transport a chance to clean up any listeners.
Shutdown() error } memberlist-0.1.0/transport_test.go000066400000000000000000000044411307374264600173140ustar00rootroot00000000000000package memberlist import ( "bytes" "testing" "time" ) func TestTransport_Join(t *testing.T) { net := &MockNetwork{} t1 := net.NewTransport() c1 := DefaultLANConfig() c1.Name = "node1" c1.Transport = t1 m1, err := Create(c1) if err != nil { t.Fatalf("err: %v", err) } m1.setAlive() m1.schedule() defer m1.Shutdown() c2 := DefaultLANConfig() c2.Name = "node2" c2.Transport = net.NewTransport() m2, err := Create(c2) if err != nil { t.Fatalf("err: %v", err) } m2.setAlive() m2.schedule() defer m2.Shutdown() num, err := m2.Join([]string{t1.addr.String()}) if num != 1 { t.Fatalf("bad: %d", num) } if err != nil { t.Fatalf("err: %v", err) } if len(m2.Members()) != 2 { t.Fatalf("bad: %v", m2.Members()) } if m2.estNumNodes() != 2 { t.Fatalf("bad: %v", m2.Members()) } } func TestTransport_Send(t *testing.T) { net := &MockNetwork{} t1 := net.NewTransport() d1 := &MockDelegate{} c1 := DefaultLANConfig() c1.Name = "node1" c1.Transport = t1 c1.Delegate = d1 m1, err := Create(c1) if err != nil { t.Fatalf("err: %v", err) } m1.setAlive() m1.schedule() defer m1.Shutdown() c2 := DefaultLANConfig() c2.Name = "node2" c2.Transport = net.NewTransport() m2, err := Create(c2) if err != nil { t.Fatalf("err: %v", err) } m2.setAlive() m2.schedule() defer m2.Shutdown() num, err := m2.Join([]string{t1.addr.String()}) if num != 1 { t.Fatalf("bad: %d", num) } if err != nil { t.Fatalf("err: %v", err) } if err := m2.SendTo(t1.addr, []byte("SendTo")); err != nil { t.Fatalf("err: %v", err) } var n1 *Node for _, n := range m2.Members() { if n.Name == c1.Name { n1 = n break } } if n1 == nil { t.Fatalf("bad") } if err := m2.SendToUDP(n1, []byte("SendToUDP")); err != nil { t.Fatalf("err: %v", err) } if err := m2.SendToTCP(n1, []byte("SendToTCP")); err != nil { t.Fatalf("err: %v", err) } if err := m2.SendBestEffort(n1, []byte("SendBestEffort")); err != nil { t.Fatalf("err: %v", err) } if err := m2.SendReliable(n1, []byte("SendReliable")); err != nil { t.Fatalf("err: %v", err) } time.Sleep(100 * time.Millisecond) received := bytes.Join(d1.msgs, []byte("|")) expected := []byte("SendTo|SendToUDP|SendToTCP|SendBestEffort|SendReliable") if !bytes.Equal(received, expected) { t.Fatalf("bad: %s", received) } } memberlist-0.1.0/util.go000066400000000000000000000166431307374264600152050ustar00rootroot00000000000000package memberlist import ( "bytes" "compress/lzw" "encoding/binary" "fmt" "io" "math" "math/rand" "net" "strconv" "strings" "time" "github.com/hashicorp/go-msgpack/codec" "github.com/sean-/seed" ) // pushPullScale is the minimum number of nodes // before we start scaling the push/pull timing. The scale // effect is the log2(Nodes) - log2(pushPullScale). This means // that the 33rd node will cause us to double the interval, // while the 65th will triple it. 
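//
// Concretely, with a 1s base interval, pushPullScale below keeps 1s for up to
// 32 nodes, stretches it to 2s for 33-64 nodes, and to 3s for 65-128 nodes
// (see TestPushPullScale).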
const pushPullScaleThreshold = 32

const (
	// litWidth for LZW; must be in the range 2-8
	lzwLitWidth = 8
)

func init() {
	seed.Init()
}

// Decode reverses the encode operation on a byte slice input
func decode(buf []byte, out interface{}) error {
	r := bytes.NewReader(buf)
	hd := codec.MsgpackHandle{}
	dec := codec.NewDecoder(r, &hd)
	return dec.Decode(out)
}

// Encode writes an encoded object to a new bytes buffer
func encode(msgType messageType, in interface{}) (*bytes.Buffer, error) {
	buf := bytes.NewBuffer(nil)
	buf.WriteByte(uint8(msgType))
	hd := codec.MsgpackHandle{}
	enc := codec.NewEncoder(buf, &hd)
	err := enc.Encode(in)
	return buf, err
}

// Returns a random offset in [0, n)
func randomOffset(n int) int {
	if n == 0 {
		return 0
	}
	return int(rand.Uint32() % uint32(n))
}

// suspicionTimeout computes the timeout that should be used when
// a node is suspected
func suspicionTimeout(suspicionMult, n int, interval time.Duration) time.Duration {
	nodeScale := math.Max(1.0, math.Log10(math.Max(1.0, float64(n))))
	// multiply by 1000 to keep some precision because time.Duration is an int64 type
	timeout := time.Duration(suspicionMult) * time.Duration(nodeScale*1000) * interval / 1000
	return timeout
}

// retransmitLimit computes the limit of retransmissions
func retransmitLimit(retransmitMult, n int) int {
	nodeScale := math.Ceil(math.Log10(float64(n + 1)))
	limit := retransmitMult * int(nodeScale)
	return limit
}

// shuffleNodes randomly shuffles the input nodes using the Fisher-Yates shuffle
func shuffleNodes(nodes []*nodeState) {
	n := len(nodes)
	for i := n - 1; i > 0; i-- {
		j := rand.Intn(i + 1)
		nodes[i], nodes[j] = nodes[j], nodes[i]
	}
}

// pushPullScale is used to scale the time interval at which push/pull
// syncs take place. It is used to prevent network saturation as the
// cluster size grows
func pushPullScale(interval time.Duration, n int) time.Duration {
	// Don't scale until we cross the threshold
	if n <= pushPullScaleThreshold {
		return interval
	}

	multiplier := math.Ceil(math.Log2(float64(n))-math.Log2(pushPullScaleThreshold)) + 1.0
	return time.Duration(multiplier) * interval
}

// moveDeadNodes moves nodes that are dead and beyond the gossip to the dead interval
// to the end of the slice and returns the index of the first moved node.
func moveDeadNodes(nodes []*nodeState, gossipToTheDeadTime time.Duration) int {
	numDead := 0
	n := len(nodes)
	for i := 0; i < n-numDead; i++ {
		if nodes[i].State != stateDead {
			continue
		}

		// Respect the gossip to the dead interval
		if time.Since(nodes[i].StateChange) <= gossipToTheDeadTime {
			continue
		}

		// Move this node to the end
		nodes[i], nodes[n-numDead-1] = nodes[n-numDead-1], nodes[i]
		numDead++
		i--
	}
	return n - numDead
}

// kRandomNodes is used to select up to k random nodes, excluding any nodes where
// the filter function returns true. It is possible that fewer than k nodes are
// returned.
func kRandomNodes(k int, nodes []*nodeState, filterFn func(*nodeState) bool) []*nodeState {
	n := len(nodes)
	kNodes := make([]*nodeState, 0, k)
OUTER:
	// Probe up to 3*n times, with large n this is not necessary
	// since k << n, but with small n we want search to be
	// exhaustive
	for i := 0; i < 3*n && len(kNodes) < k; i++ {
		// Get random node
		idx := randomOffset(n)
		node := nodes[idx]

		// Give the filter a shot at it.
if filterFn != nil && filterFn(node) { continue OUTER } // Check if we have this node already for j := 0; j < len(kNodes); j++ { if node == kNodes[j] { continue OUTER } } // Append the node kNodes = append(kNodes, node) } return kNodes } // makeCompoundMessage takes a list of messages and generates // a single compound message containing all of them func makeCompoundMessage(msgs [][]byte) *bytes.Buffer { // Create a local buffer buf := bytes.NewBuffer(nil) // Write out the type buf.WriteByte(uint8(compoundMsg)) // Write out the number of message buf.WriteByte(uint8(len(msgs))) // Add the message lengths for _, m := range msgs { binary.Write(buf, binary.BigEndian, uint16(len(m))) } // Append the messages for _, m := range msgs { buf.Write(m) } return buf } // decodeCompoundMessage splits a compound message and returns // the slices of individual messages. Also returns the number // of truncated messages and any potential error func decodeCompoundMessage(buf []byte) (trunc int, parts [][]byte, err error) { if len(buf) < 1 { err = fmt.Errorf("missing compound length byte") return } numParts := uint8(buf[0]) buf = buf[1:] // Check we have enough bytes if len(buf) < int(numParts*2) { err = fmt.Errorf("truncated len slice") return } // Decode the lengths lengths := make([]uint16, numParts) for i := 0; i < int(numParts); i++ { lengths[i] = binary.BigEndian.Uint16(buf[i*2 : i*2+2]) } buf = buf[numParts*2:] // Split each message for idx, msgLen := range lengths { if len(buf) < int(msgLen) { trunc = int(numParts) - idx return } // Extract the slice, seek past on the buffer slice := buf[:msgLen] buf = buf[msgLen:] parts = append(parts, slice) } return } // Given a string of the form "host", "host:port", // "ipv6::addr" or "[ipv6::address]:port", // return true if the string includes a port. func hasPort(s string) bool { last := strings.LastIndex(s, ":") if last == -1 { return false } if s[0] == '[' { return s[last-1] == ']' } return strings.Index(s, ":") == last } // compressPayload takes an opaque input buffer, compresses it // and wraps it in a compress{} message that is encoded. func compressPayload(inp []byte) (*bytes.Buffer, error) { var buf bytes.Buffer compressor := lzw.NewWriter(&buf, lzw.LSB, lzwLitWidth) _, err := compressor.Write(inp) if err != nil { return nil, err } // Ensure we flush everything out if err := compressor.Close(); err != nil { return nil, err } // Create a compressed message c := compress{ Algo: lzwAlgo, Buf: buf.Bytes(), } return encode(compressMsg, &c) } // decompressPayload is used to unpack an encoded compress{} // message and return its payload uncompressed func decompressPayload(msg []byte) ([]byte, error) { // Decode the message var c compress if err := decode(msg, &c); err != nil { return nil, err } return decompressBuffer(&c) } // decompressBuffer is used to decompress the buffer of // a single compress message, handling multiple algorithms func decompressBuffer(c *compress) ([]byte, error) { // Verify the algorithm if c.Algo != lzwAlgo { return nil, fmt.Errorf("Cannot decompress unknown algorithm %d", c.Algo) } // Create a uncompressor uncomp := lzw.NewReader(bytes.NewReader(c.Buf), lzw.LSB, lzwLitWidth) defer uncomp.Close() // Read all the data var b bytes.Buffer _, err := io.Copy(&b, uncomp) if err != nil { return nil, err } // Return the uncompressed bytes return b.Bytes(), nil } // joinHostPort returns the host:port form of an address, for use with a // transport. 
func joinHostPort(host string, port uint16) string { return net.JoinHostPort(host, strconv.Itoa(int(port))) } memberlist-0.1.0/util_test.go000066400000000000000000000160751307374264600162430ustar00rootroot00000000000000package memberlist import ( "fmt" "reflect" "testing" "time" ) func Test_hasPort(t *testing.T) { cases := []struct { s string expected bool }{ {"", false}, {":80", true}, {"127.0.0.1", false}, {"127.0.0.1:80", true}, {"::1", false}, {"2001:db8:a0b:12f0::1", false}, {"[2001:db8:a0b:12f0::1]", false}, {"[2001:db8:a0b:12f0::1]:80", true}, } for _, c := range cases { if hasPort(c.s) != c.expected { t.Fatalf("bad: '%s' hasPort was not %v", c.s, c.expected) } } } func TestEncodeDecode(t *testing.T) { msg := &ping{SeqNo: 100} buf, err := encode(pingMsg, msg) if err != nil { t.Fatalf("unexpected err: %s", err) } var out ping if err := decode(buf.Bytes()[1:], &out); err != nil { t.Fatalf("unexpected err: %s", err) } if msg.SeqNo != out.SeqNo { t.Fatalf("bad sequence no") } } func TestRandomOffset(t *testing.T) { vals := make(map[int]struct{}) for i := 0; i < 100; i++ { offset := randomOffset(2 << 30) if _, ok := vals[offset]; ok { t.Fatalf("got collision") } vals[offset] = struct{}{} } } func TestRandomOffset_Zero(t *testing.T) { offset := randomOffset(0) if offset != 0 { t.Fatalf("bad offset") } } func TestSuspicionTimeout(t *testing.T) { timeouts := map[int]time.Duration{ 5: 1000 * time.Millisecond, 10: 1000 * time.Millisecond, 50: 1698 * time.Millisecond, 100: 2000 * time.Millisecond, 500: 2698 * time.Millisecond, 1000: 3000 * time.Millisecond, } for n, expected := range timeouts { timeout := suspicionTimeout(3, n, time.Second) / 3 if timeout != expected { t.Fatalf("bad: %v, %v", expected, timeout) } } } func TestRetransmitLimit(t *testing.T) { lim := retransmitLimit(3, 0) if lim != 0 { t.Fatalf("bad val %v", lim) } lim = retransmitLimit(3, 1) if lim != 3 { t.Fatalf("bad val %v", lim) } lim = retransmitLimit(3, 99) if lim != 6 { t.Fatalf("bad val %v", lim) } } func TestShuffleNodes(t *testing.T) { orig := []*nodeState{ &nodeState{ State: stateDead, }, &nodeState{ State: stateAlive, }, &nodeState{ State: stateAlive, }, &nodeState{ State: stateDead, }, &nodeState{ State: stateAlive, }, &nodeState{ State: stateAlive, }, &nodeState{ State: stateDead, }, &nodeState{ State: stateAlive, }, } nodes := make([]*nodeState, len(orig)) copy(nodes[:], orig[:]) if !reflect.DeepEqual(nodes, orig) { t.Fatalf("should match") } shuffleNodes(nodes) if reflect.DeepEqual(nodes, orig) { t.Fatalf("should not match") } } func TestPushPullScale(t *testing.T) { sec := time.Second for i := 0; i <= 32; i++ { if s := pushPullScale(sec, i); s != sec { t.Fatalf("Bad time scale: %v", s) } } for i := 33; i <= 64; i++ { if s := pushPullScale(sec, i); s != 2*sec { t.Fatalf("Bad time scale: %v", s) } } for i := 65; i <= 128; i++ { if s := pushPullScale(sec, i); s != 3*sec { t.Fatalf("Bad time scale: %v", s) } } } func TestMoveDeadNodes(t *testing.T) { nodes := []*nodeState{ &nodeState{ State: stateDead, StateChange: time.Now().Add(-20 * time.Second), }, &nodeState{ State: stateAlive, StateChange: time.Now().Add(-20 * time.Second), }, // This dead node should not be moved, as its state changed // less than the specified GossipToTheDead time ago &nodeState{ State: stateDead, StateChange: time.Now().Add(-10 * time.Second), }, &nodeState{ State: stateAlive, StateChange: time.Now().Add(-20 * time.Second), }, &nodeState{ State: stateDead, StateChange: time.Now().Add(-20 * time.Second), }, &nodeState{ State: stateAlive, 
StateChange: time.Now().Add(-20 * time.Second), }, } idx := moveDeadNodes(nodes, (15 * time.Second)) if idx != 4 { t.Fatalf("bad index") } for i := 0; i < idx; i++ { switch i { case 2: // Recently dead node remains at index 2, // since nodes are swapped out to move to end. if nodes[i].State != stateDead { t.Fatalf("Bad state %d", i) } default: if nodes[i].State != stateAlive { t.Fatalf("Bad state %d", i) } } } for i := idx; i < len(nodes); i++ { if nodes[i].State != stateDead { t.Fatalf("Bad state %d", i) } } } func TestKRandomNodes(t *testing.T) { nodes := []*nodeState{} for i := 0; i < 90; i++ { // Half the nodes are in a bad state state := stateAlive switch i % 3 { case 0: state = stateAlive case 1: state = stateSuspect case 2: state = stateDead } nodes = append(nodes, &nodeState{ Node: Node{ Name: fmt.Sprintf("test%d", i), }, State: state, }) } filterFunc := func(n *nodeState) bool { if n.Name == "test0" || n.State != stateAlive { return true } return false } s1 := kRandomNodes(3, nodes, filterFunc) s2 := kRandomNodes(3, nodes, filterFunc) s3 := kRandomNodes(3, nodes, filterFunc) if reflect.DeepEqual(s1, s2) { t.Fatalf("unexpected equal") } if reflect.DeepEqual(s1, s3) { t.Fatalf("unexpected equal") } if reflect.DeepEqual(s2, s3) { t.Fatalf("unexpected equal") } for _, s := range [][]*nodeState{s1, s2, s3} { if len(s) != 3 { t.Fatalf("bad len") } for _, n := range s { if n.Name == "test0" { t.Fatalf("Bad name") } if n.State != stateAlive { t.Fatalf("Bad state") } } } } func TestMakeCompoundMessage(t *testing.T) { msg := &ping{SeqNo: 100} buf, err := encode(pingMsg, msg) if err != nil { t.Fatalf("unexpected err: %s", err) } msgs := [][]byte{buf.Bytes(), buf.Bytes(), buf.Bytes()} compound := makeCompoundMessage(msgs) if compound.Len() != 3*buf.Len()+3*compoundOverhead+compoundHeaderOverhead { t.Fatalf("bad len") } } func TestDecodeCompoundMessage(t *testing.T) { msg := &ping{SeqNo: 100} buf, err := encode(pingMsg, msg) if err != nil { t.Fatalf("unexpected err: %s", err) } msgs := [][]byte{buf.Bytes(), buf.Bytes(), buf.Bytes()} compound := makeCompoundMessage(msgs) trunc, parts, err := decodeCompoundMessage(compound.Bytes()[1:]) if err != nil { t.Fatalf("unexpected err: %s", err) } if trunc != 0 { t.Fatalf("should not truncate") } if len(parts) != 3 { t.Fatalf("bad parts") } for _, p := range parts { if len(p) != buf.Len() { t.Fatalf("bad part len") } } } func TestDecodeCompoundMessage_Trunc(t *testing.T) { msg := &ping{SeqNo: 100} buf, err := encode(pingMsg, msg) if err != nil { t.Fatalf("unexpected err: %s", err) } msgs := [][]byte{buf.Bytes(), buf.Bytes(), buf.Bytes()} compound := makeCompoundMessage(msgs) trunc, parts, err := decodeCompoundMessage(compound.Bytes()[1:38]) if err != nil { t.Fatalf("unexpected err: %s", err) } if trunc != 1 { t.Fatalf("truncate: %d", trunc) } if len(parts) != 2 { t.Fatalf("bad parts") } for _, p := range parts { if len(p) != buf.Len() { t.Fatalf("bad part len") } } } func TestCompressDecompressPayload(t *testing.T) { buf, err := compressPayload([]byte("testing")) if err != nil { t.Fatalf("unexpected err: %s", err) } decomp, err := decompressPayload(buf.Bytes()[1:]) if err != nil { t.Fatalf("unexpected err: %s", err) } if !reflect.DeepEqual(decomp, []byte("testing")) { t.Fatalf("bad payload: %v", decomp) } }
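// For reference while reading the compound-message tests above: a compound
// message is laid out as one compoundMsg type byte, one count byte, one
// big-endian uint16 length per part, and then the raw parts back to back;
// decodeCompoundMessage (which gets the buffer with the type byte already
// stripped) reports how many trailing parts were truncated when the buffer
// runs short.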