001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase;
019
020import java.io.Closeable;
021import java.io.IOException;
022import org.apache.hadoop.conf.Configurable;
023import org.apache.hadoop.conf.Configuration;
024import org.apache.hadoop.hbase.client.RegionInfoBuilder;
025import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
026import org.apache.hadoop.hbase.util.Threads;
027import org.apache.yetus.audience.InterfaceAudience;
028import org.slf4j.Logger;
029import org.slf4j.LoggerFactory;
030
031import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.AdminService;
032import org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos.ClientService;
033import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.MasterService;
034
035/**
036 * This class defines methods that can help with managing HBase clusters from unit tests and system
037 * tests. There are 3 types of cluster deployments:
038 * <ul>
039 * <li><b>MiniHBaseCluster:</b> each server is run in the same JVM in separate threads, used by unit
040 * tests</li>
041 * <li><b>DistributedHBaseCluster:</b> the cluster is pre-deployed, system and integration tests can
042 * interact with the cluster.</li>
043 * <li><b>ProcessBasedLocalHBaseCluster:</b> each server is deployed locally but in separate JVMs.
044 * </li>
045 * </ul>
046 * <p>
047 * HBaseCluster unifies the way tests interact with the cluster, so that the same test can be run
048 * against a mini-cluster during unit test execution, or a distributed cluster having tens/hundreds
049 * of nodes during execution of integration tests.
050 * <p>
 * HBaseCluster exposes client-side public interfaces to tests, so that tests do not assume
052 * running in a particular mode. Not all the tests are suitable to be run on an actual cluster, and
053 * some tests will still need to mock stuff and introspect internal state. For those use cases from
054 * unit tests, or if more control is needed, you can use the subclasses directly. In that sense,
055 * this class does not abstract away <strong>every</strong> interface that MiniHBaseCluster or
056 * DistributedHBaseCluster provide.
057 */
058@InterfaceAudience.Public
059public abstract class HBaseCluster implements Closeable, Configurable {
060  // Log is being used in DistributedHBaseCluster class, hence keeping it as package scope
061  static final Logger LOG = LoggerFactory.getLogger(HBaseCluster.class.getName());
062  protected Configuration conf;
063
064  /** the status of the cluster before we begin */
065  protected ClusterMetrics initialClusterStatus;
066
067  /**
068   * Construct an HBaseCluster
069   * @param conf Configuration to be used for cluster
070   */
071  public HBaseCluster(Configuration conf) {
072    setConf(conf);
073  }
074
075  @Override
076  public void setConf(Configuration conf) {
077    this.conf = conf;
078  }
079
080  @Override
081  public Configuration getConf() {
082    return conf;
083  }
084
085  /**
086   * Returns a ClusterMetrics for this HBase cluster.
087   * @see #getInitialClusterMetrics()
088   */
089  public abstract ClusterMetrics getClusterMetrics() throws IOException;
090
091  /**
092   * Returns a ClusterStatus for this HBase cluster as observed at the starting of the HBaseCluster
093   */
094  public ClusterMetrics getInitialClusterMetrics() throws IOException {
095    return initialClusterStatus;
096  }
097
098  /**
099   * Returns an {@link MasterService.BlockingInterface} to the active master
100   */
101  public abstract MasterService.BlockingInterface getMasterAdminService() throws IOException;
102
103  /**
104   * Returns an AdminProtocol interface to the regionserver
105   */
106  public abstract AdminService.BlockingInterface getAdminProtocol(ServerName serverName)
107    throws IOException;
108
109  /**
110   * Returns a ClientProtocol interface to the regionserver
111   */
112  public abstract ClientService.BlockingInterface getClientProtocol(ServerName serverName)
113    throws IOException;
114
115  /**
116   * Starts a new region server on the given hostname or if this is a mini/local cluster, starts a
117   * region server locally.
118   * @param hostname the hostname to start the regionserver on
119   * @throws IOException if something goes wrong
120   */
121  public abstract void startRegionServer(String hostname, int port) throws IOException;
122
123  /**
124   * Kills the region server process if this is a distributed cluster, otherwise this causes the
125   * region server to exit doing basic clean up only.
126   * @throws IOException if something goes wrong
127   */
128  public abstract void killRegionServer(ServerName serverName) throws IOException;
129
130  /**
131   * Keeping track of killed servers and being able to check if a particular server was killed makes
132   * it possible to do fault tolerance testing for dead servers in a deterministic way. A concrete
133   * example of such case is - killing servers and waiting for all regions of a particular table to
134   * be assigned. We can check for server column in META table and that its value is not one of the
135   * killed servers.
136   */
137  public abstract boolean isKilledRS(ServerName serverName);
138
139  /**
140   * Stops the given region server, by attempting a gradual stop.
141   * @throws IOException if something goes wrong
142   */
143  public abstract void stopRegionServer(ServerName serverName) throws IOException;
144
145  /**
146   * Wait for the specified region server to join the cluster
147   * @throws IOException if something goes wrong or timeout occurs
148   */
149  public void waitForRegionServerToStart(String hostname, int port, long timeout)
150    throws IOException {
151    long start = EnvironmentEdgeManager.currentTime();
152    while ((EnvironmentEdgeManager.currentTime() - start) < timeout) {
153      for (ServerName server : getClusterMetrics().getLiveServerMetrics().keySet()) {
154        if (server.getHostname().equals(hostname) && server.getPort() == port) {
155          return;
156        }
157      }
158      Threads.sleep(100);
159    }
160    throw new IOException(
161      "did timeout " + timeout + "ms waiting for region server to start: " + hostname);
162  }
163
164  /**
165   * Wait for the specified region server to stop the thread / process.
166   * @throws IOException if something goes wrong or timeout occurs
167   */
168  public abstract void waitForRegionServerToStop(ServerName serverName, long timeout)
169    throws IOException;
170
171  /**
172   * Suspend the region server
173   * @param serverName the hostname to suspend the regionserver on
174   * @throws IOException if something goes wrong
175   */
176  public abstract void suspendRegionServer(ServerName serverName) throws IOException;
177
178  /**
179   * Wait for the specified region server to suspend the thread / process.
180   * @throws IOException if something goes wrong or timeout occurs
181   */
182  public abstract void waitForRegionServerToSuspend(ServerName serverName, long timeout)
183    throws IOException;
184
185  /**
186   * Resume the region server
187   * @param serverName the hostname to resume the regionserver on
188   * @throws IOException if something goes wrong
189   */
190  public abstract void resumeRegionServer(ServerName serverName) throws IOException;
191
192  /**
193   * Wait for the specified region server to resume the thread / process.
194   * @throws IOException if something goes wrong or timeout occurs
195   */
196  public abstract void waitForRegionServerToResume(ServerName serverName, long timeout)
197    throws IOException;
198
199  /**
200   * Starts a new zookeeper node on the given hostname or if this is a mini/local cluster, silently
201   * logs warning message.
202   * @param hostname the hostname to start the regionserver on
203   * @throws IOException if something goes wrong
204   */
205  public abstract void startZkNode(String hostname, int port) throws IOException;
206
207  /**
208   * Kills the zookeeper node process if this is a distributed cluster, otherwise, this causes
209   * master to exit doing basic clean up only.
210   * @throws IOException if something goes wrong
211   */
212  public abstract void killZkNode(ServerName serverName) throws IOException;
213
214  /**
215   * Stops the region zookeeper if this is a distributed cluster, otherwise silently logs warning
216   * message.
217   * @throws IOException if something goes wrong
218   */
219  public abstract void stopZkNode(ServerName serverName) throws IOException;
220
221  /**
222   * Wait for the specified zookeeper node to join the cluster
223   * @throws IOException if something goes wrong or timeout occurs
224   */
225  public abstract void waitForZkNodeToStart(ServerName serverName, long timeout) throws IOException;
226
227  /**
228   * Wait for the specified zookeeper node to stop the thread / process.
229   * @throws IOException if something goes wrong or timeout occurs
230   */
231  public abstract void waitForZkNodeToStop(ServerName serverName, long timeout) throws IOException;
232
233  /**
234   * Starts a new datanode on the given hostname or if this is a mini/local cluster, silently logs
235   * warning message.
236   * @throws IOException if something goes wrong
237   */
238  public abstract void startDataNode(ServerName serverName) throws IOException;
239
240  /**
241   * Kills the datanode process if this is a distributed cluster, otherwise, this causes master to
242   * exit doing basic clean up only.
243   * @throws IOException if something goes wrong
244   */
245  public abstract void killDataNode(ServerName serverName) throws IOException;
246
247  /**
248   * Stops the datanode if this is a distributed cluster, otherwise silently logs warning message.
249   * @throws IOException if something goes wrong
250   */
251  public abstract void stopDataNode(ServerName serverName) throws IOException;
252
253  /**
254   * Wait for the specified datanode to join the cluster
255   * @throws IOException if something goes wrong or timeout occurs
256   */
257  public abstract void waitForDataNodeToStart(ServerName serverName, long timeout)
258    throws IOException;
259
260  /**
261   * Wait for the specified datanode to stop the thread / process.
262   * @throws IOException if something goes wrong or timeout occurs
263   */
264  public abstract void waitForDataNodeToStop(ServerName serverName, long timeout)
265    throws IOException;
266
267  /**
268   * Starts a new namenode on the given hostname or if this is a mini/local cluster, silently logs
269   * warning message.
270   * @throws IOException if something goes wrong
271   */
272  public abstract void startNameNode(ServerName serverName) throws IOException;
273
274  /**
275   * Kills the namenode process if this is a distributed cluster, otherwise, this causes master to
276   * exit doing basic clean up only.
277   * @throws IOException if something goes wrong
278   */
279  public abstract void killNameNode(ServerName serverName) throws IOException;
280
281  /**
282   * Stops the namenode if this is a distributed cluster, otherwise silently logs warning message.
283   * @throws IOException if something goes wrong
284   */
285  public abstract void stopNameNode(ServerName serverName) throws IOException;
286
287  /**
288   * Wait for the specified namenode to join the cluster
289   * @throws IOException if something goes wrong or timeout occurs
290   */
291  public abstract void waitForNameNodeToStart(ServerName serverName, long timeout)
292    throws IOException;
293
294  /**
295   * Wait for the specified namenode to stop
296   * @throws IOException if something goes wrong or timeout occurs
297   */
298  public abstract void waitForNameNodeToStop(ServerName serverName, long timeout)
299    throws IOException;
300
301  /**
302   * Starts a new journalnode on the given hostname or if this is a mini/local cluster, silently
303   * logs warning message.
304   * @throws IOException if something goes wrong
305   */
306  public abstract void startJournalNode(ServerName serverName) throws IOException;
307
308  /**
309   * Kills the journalnode process if this is a distributed cluster, otherwise, this causes master
310   * to exit doing basic clean up only.
311   * @throws IOException if something goes wrong
312   */
313  public abstract void killJournalNode(ServerName serverName) throws IOException;
314
315  /**
316   * Stops the journalnode if this is a distributed cluster, otherwise silently logs warning
317   * message.
318   * @throws IOException if something goes wrong
319   */
320  public abstract void stopJournalNode(ServerName serverName) throws IOException;
321
322  /**
323   * Wait for the specified journalnode to join the cluster
324   * @throws IOException if something goes wrong or timeout occurs
325   */
326  public abstract void waitForJournalNodeToStart(ServerName serverName, long timeout)
327    throws IOException;
328
329  /**
330   * Wait for the specified journalnode to stop
331   * @throws IOException if something goes wrong or timeout occurs
332   */
333  public abstract void waitForJournalNodeToStop(ServerName serverName, long timeout)
334    throws IOException;
335
336  /**
337   * Starts a new master on the given hostname or if this is a mini/local cluster, starts a master
338   * locally.
339   * @param hostname the hostname to start the master on
340   * @throws IOException if something goes wrong
341   */
342  public abstract void startMaster(String hostname, int port) throws IOException;
343
344  /**
345   * Kills the master process if this is a distributed cluster, otherwise, this causes master to
346   * exit doing basic clean up only.
347   * @throws IOException if something goes wrong
348   */
349  public abstract void killMaster(ServerName serverName) throws IOException;
350
351  /**
352   * Stops the given master, by attempting a gradual stop.
353   * @throws IOException if something goes wrong
354   */
355  public abstract void stopMaster(ServerName serverName) throws IOException;
356
357  /**
358   * Wait for the specified master to stop the thread / process.
359   * @throws IOException if something goes wrong or timeout occurs
360   */
361  public abstract void waitForMasterToStop(ServerName serverName, long timeout) throws IOException;
362
363  /**
364   * Blocks until there is an active master and that master has completed initialization.
365   * @return true if an active master becomes available. false if there are no masters left.
366   * @throws IOException if something goes wrong or timeout occurs
367   */
368  public boolean waitForActiveAndReadyMaster() throws IOException {
369    return waitForActiveAndReadyMaster(Long.MAX_VALUE);
370  }
371
372  /**
373   * Blocks until there is an active master and that master has completed initialization.
374   * @param timeout the timeout limit in ms
375   * @return true if an active master becomes available. false if there are no masters left.
376   */
377  public abstract boolean waitForActiveAndReadyMaster(long timeout) throws IOException;
378
379  /**
380   * Wait for HBase Cluster to shut down.
381   */
382  public abstract void waitUntilShutDown() throws IOException;
383
384  /**
385   * Shut down the HBase cluster
386   */
387  public abstract void shutdown() throws IOException;
388
389  /**
390   * Restores the cluster to it's initial state if this is a real cluster, otherwise does nothing.
391   * This is a best effort restore. If the servers are not reachable, or insufficient permissions,
392   * etc. restoration might be partial.
393   * @return whether restoration is complete
394   */
395  public boolean restoreInitialStatus() throws IOException {
396    return restoreClusterMetrics(getInitialClusterMetrics());
397  }
398
399  /**
400   * Restores the cluster to given state if this is a real cluster, otherwise does nothing. This is
401   * a best effort restore. If the servers are not reachable, or insufficient permissions, etc.
402   * restoration might be partial.
403   * @return whether restoration is complete
404   */
405  public boolean restoreClusterMetrics(ClusterMetrics desiredStatus) throws IOException {
406    return true;
407  }
408
409  /**
410   * Get the ServerName of region server serving the first hbase:meta region
411   */
412  public ServerName getServerHoldingMeta() throws IOException {
413    return getServerHoldingRegion(TableName.META_TABLE_NAME,
414      RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionName());
415  }
416
417  /**
418   * Get the ServerName of region server serving the specified region
419   * @param regionName Name of the region in bytes
420   * @param tn         Table name that has the region.
421   * @return ServerName that hosts the region or null
422   */
423  public abstract ServerName getServerHoldingRegion(final TableName tn, byte[] regionName)
424    throws IOException;
425
426  /**
427   * @return whether we are interacting with a distributed cluster as opposed to an in-process
428   *         mini/local cluster.
429   */
430  public boolean isDistributedCluster() {
431    return false;
432  }
433
434  /**
435   * Closes all the resources held open for this cluster. Note that this call does not shutdown the
436   * cluster.
437   * @see #shutdown()
438   */
439  @Override
440  public abstract void close() throws IOException;
441
442  /**
443   * Wait for the namenode.
444   */
445  public void waitForNamenodeAvailable() throws InterruptedException {
446  }
447
448  public void waitForDatanodesRegistered(int nbDN) throws Exception {
449  }
450}