diff --git a/cli/api/metaapi.go b/cli/api/metaapi.go
index 448b80e4a9..c040d6c351 100644
--- a/cli/api/metaapi.go
+++ b/cli/api/metaapi.go
@@ -87,6 +87,11 @@ type MetaHttpClient struct {
 // NewMetaHttpClient returns a new MetaHttpClient instance.
 func NewMetaHttpClient(host string, useSSL bool) *MetaHttpClient {
 	mc := &MetaHttpClient{host: host, useSSL: useSSL}
+	var err error
+	_, err = log.InitLog("/tmp/cfs", "cli", log.DebugLevel, nil)
+	if err != nil {
+		fmt.Printf("init cli log err[%v]\n", err)
+	}
 	return mc
 }
 
@@ -213,6 +218,7 @@ func (mc *MetaHttpClient) GetMetaPartition(pid uint64) (cursor uint64, err error
 		if err != nil {
 			log.LogErrorf("action[GetMetaPartition],pid:%v,err:%v", pid, err)
 		}
+		log.LogFlush()
 	}()
 	request := newAPIRequest(http.MethodGet, "/getPartitionById")
 	request.params["pid"] = fmt.Sprintf("%v", pid)
@@ -236,6 +242,7 @@ func (mc *MetaHttpClient) GetAllDentry(pid uint64) (dentryMap map[string]*metano
 		if err != nil {
 			log.LogErrorf("action[GetAllDentry],pid:%v,err:%v", pid, err)
 		}
+		log.LogFlush()
 	}()
 	dentryMap = make(map[string]*metanode.Dentry, 0)
 	request := newAPIRequest(http.MethodGet, "/getAllDentry")
@@ -286,6 +293,7 @@ func (mc *MetaHttpClient) GetAllInodes(pid uint64) (rstMap map[uint64]*Inode, er
 		if err != nil {
 			log.LogErrorf("action[GetAllInodes],pid:%v,err:%v", pid, err)
 		}
+		log.LogFlush()
 	}()
 	reqURL := fmt.Sprintf("http://%v%v?pid=%v", mc.host, "/getAllInodes", pid)
 	log.LogDebugf("reqURL=%v", reqURL)
diff --git a/cli/build.sh b/cli/build.sh
index 0d693d1654..b57530b288 100755
--- a/cli/build.sh
+++ b/cli/build.sh
@@ -1,22 +1,8 @@
 #!/usr/bin/env bash
-RootPath=$(cd $(dirname $0)/..; pwd)
-
-Version=`git describe --abbrev=0 --tags 2>/dev/null`
-BranchName=`git rev-parse --abbrev-ref HEAD 2>/dev/null`
-CommitID=`git rev-parse HEAD 2>/dev/null`
+BranchName=`git rev-parse --abbrev-ref HEAD`
+CommitID=`git rev-parse HEAD`
 BuildTime=`date +%Y-%m-%d\ %H:%M`
-SrcPath=${RootPath}/cli
-TargetFile=${1:-$RootPath/cli/cfs-cli}
-
 [[ "-$GOPATH" == "-" ]] && { echo "GOPATH not set"; exit 1; }
-LDFlags="-X github.com/chubaofs/chubaofs/proto.Version=${Version} \
-    -X github.com/chubaofs/chubaofs/proto.CommitID=${CommitID} \
-    -X github.com/chubaofs/chubaofs/proto.BranchName=${BranchName} \
-    -X 'github.com/chubaofs/chubaofs/proto.BuildTime=${BuildTime}' "
-
-go build \
-    -ldflags "${LDFlags}" \
-    -o $TargetFile \
-    ${SrcPath}/*.go
+go build -ldflags "-X main.CommitID=${CommitID} -X main.BranchName=${BranchName} -X 'main.BuildTime=${BuildTime}'" -o cfs-cli
diff --git a/cli/cli.go b/cli/cli.go
index 50022ce7e3..c46d76d0b4 100644
--- a/cli/cli.go
+++ b/cli/cli.go
@@ -16,11 +16,11 @@ package cmd
 
 import (
 	"fmt"
+	"os"
+
 	"github.com/chubaofs/chubaofs/cli/cmd"
 	"github.com/chubaofs/chubaofs/sdk/master"
-	"github.com/chubaofs/chubaofs/util/log"
 	"github.com/spf13/cobra"
-	"os"
 )
 
 var (
@@ -32,19 +32,17 @@ var (
 func runCLI() (err error) {
 	var cfg *cmd.Config
 	if cfg, err = cmd.LoadConfig(); err != nil {
-		fmt.Printf("init cli log err[%v]", err)
 		return
 	}
-	cfsCli := setupCommands(cfg)
-	if err = cfsCli.Execute(); err != nil {
-		log.LogErrorf("Command fail, err:%v", err)
-	}
+	cfscli := setupCommands(cfg)
+	err = cfscli.Execute()
 	return
 }
 
 func setupCommands(cfg *cmd.Config) *cobra.Command {
 	var mc = master.NewMasterClient(cfg.MasterAddr, false)
-	mc.SetTimeout(cfg.Timeout)
+	mc.DataNodeProfPort = cfg.DataNodeProfPort
+	mc.MetaNodeProfPort = cfg.MetaNodeProfPort
 	cfsRootCmd := cmd.NewRootCmd(mc)
 	var completionCmd = &cobra.Command{
 		Use: "completion",
@@ -77,10 +75,7 @@ following command to execute:
 func main() {
 	var err error
-	_, err = log.InitLog("/tmp/cfs", "cli", log.DebugLevel, nil)
-	defer log.LogFlush()
 	if err = runCLI(); err != nil {
-		log.LogFlush()
 		_, _ = fmt.Fprintf(os.Stderr, "Error: %v\n", err)
 		os.Exit(1)
 	}
diff --git a/cli/cmd/cluster.go b/cli/cmd/cluster.go
index dd0447de62..e9890e51a8 100644
--- a/cli/cmd/cluster.go
+++ b/cli/cmd/cluster.go
@@ -16,6 +16,7 @@ package cmd
 
 import (
 	"fmt"
+	"os"
 	"strconv"
 
 	"github.com/chubaofs/chubaofs/proto"
@@ -64,12 +65,14 @@ func newClusterInfoCmd(client *master.MasterClient) *cobra.Command {
 			var cv *proto.ClusterView
 			var delPara map[string]string
 			if cv, err = client.AdminAPI().GetCluster(); err != nil {
-				errout("Error: %v", err)
+				errout("Get cluster info fail:\n%v\n", err)
+				os.Exit(1)
 			}
 			stdout("[Cluster]\n")
 			stdout(formatClusterView(cv))
 			if delPara, err = client.AdminAPI().GetDeleteParas(); err != nil {
-				errout("Error: %v", err)
+				errout("Get delete param fail:\n%v\n", err)
+				os.Exit(1)
 			}
 			stdout(fmt.Sprintf(" BatchCount     : %v\n", delPara[nodeDeleteBatchCountKey]))
 			stdout(fmt.Sprintf(" MarkDeleteRate : %v\n", delPara[nodeMarkDeleteRateKey]))
@@ -86,18 +89,11 @@ func newClusterStatCmd(client *master.MasterClient) *cobra.Command {
 		Use:   CliOpStatus,
 		Short: cmdClusterStatShort,
 		Run: func(cmd *cobra.Command, args []string) {
-			var (
-				err error
-				cs  *proto.ClusterStatInfo
-			)
-			defer func() {
-				if err != nil {
-					errout("Error: %v", err)
-				}
-			}()
+			var err error
+			var cs *proto.ClusterStatInfo
 			if cs, err = client.AdminAPI().GetClusterStat(); err != nil {
-				err = fmt.Errorf("Get cluster info fail:\n%v\n", err)
-				return
+				errout("Get cluster info fail:\n%v\n", err)
+				os.Exit(1)
 			}
 			stdout("[Cluster Status]\n")
 			stdout(formatClusterStat(cs))
@@ -109,10 +105,10 @@ func newClusterFreezeCmd(client *master.MasterClient) *cobra.Command {
 	var cmd = &cobra.Command{
-		Use:   CliOpFreeze + " [ENABLE]",
+		Use:       CliOpFreeze + " [ENABLE]",
+		ValidArgs: []string{"true", "false"},
-		Short: cmdClusterFreezeShort,
-		Args:  cobra.MinimumNArgs(1),
+		Short:     cmdClusterFreezeShort,
+		Args:      cobra.MinimumNArgs(1),
 		Long: `Turn on or off the automatic allocation of the data partitions.
 If 'freeze=false', ChubaoFS WILL automatically allocate new data partitions for the volume when:
   1. the used space is below the max capacity,
@@ -120,21 +116,15 @@ If 'freeze=false', ChubaoFS WILL automatically allocate new data partitions for
 If 'freeze=true', ChubaoFS WILL NOT automatically allocate new data partitions `,
 		Run: func(cmd *cobra.Command, args []string) {
-			var (
-				err    error
-				enable bool
-			)
-			defer func() {
-				if err != nil {
-					errout("Error: %v", err)
-				}
-			}()
+			var err error
+			var enable bool
 			if enable, err = strconv.ParseBool(args[0]); err != nil {
-				err = fmt.Errorf("Parse bool fail: %v\n", err)
-				return
+				errout("Parse bool fail: %v\n", err)
+				os.Exit(1)
 			}
 			if err = client.AdminAPI().IsFreezeCluster(enable); err != nil {
-				return
+				errout("Failed: %v\n", err)
+				os.Exit(1)
 			}
 			if enable {
 				stdout("Freeze cluster successful!\n")
@@ -154,25 +144,19 @@ func newClusterSetThresholdCmd(client *master.MasterClient) *cobra.Command {
 		Long: `Set the threshold of memory on each meta node.
 If the memory usage reaches this threshold, all the meta partitions will be read-only.`,
 		Run: func(cmd *cobra.Command, args []string) {
-			var (
-				err       error
-				threshold float64
-			)
-			defer func() {
-				if err != nil {
-					errout("Error: %v", err)
-				}
-			}()
+			var err error
+			var threshold float64
 			if threshold, err = strconv.ParseFloat(args[0], 64); err != nil {
-				err = fmt.Errorf("Parse Float fail: %v\n", err)
-				return
+				errout("Parse Float fail: %v\n", err)
+				os.Exit(1)
 			}
 			if threshold > 1.0 {
-				err = fmt.Errorf("Threshold too big\n")
-				return
+				errout("Threshold too big\n")
+				os.Exit(1)
 			}
 			if err = client.AdminAPI().SetMetaNodeThreshold(threshold); err != nil {
-				return
+				errout("Failed: %v\n", err)
+				os.Exit(1)
 			}
 			stdout("MetaNode threshold is set to %v!\n", threshold)
 		},
diff --git a/cli/cmd/compatibility.go b/cli/cmd/compatibility.go
index b1e402c6c8..9966f92f1c 100644
--- a/cli/cmd/compatibility.go
+++ b/cli/cmd/compatibility.go
@@ -15,13 +15,15 @@ package cmd
 
 import (
-	"fmt"
+	"os"
 	"github.com/chubaofs/chubaofs/cli/api"
+	"github.com/spf13/cobra"
 	"github.com/chubaofs/chubaofs/metanode"
+	"fmt"
+	"strconv"
+	"github.com/chubaofs/chubaofs/util/log"
 	"github.com/chubaofs/chubaofs/proto"
-	"github.com/spf13/cobra"
 	"reflect"
-	"strconv"
 )
 
 const (
@@ -53,8 +55,8 @@ func newMetaCompatibilityCmd() *cobra.Command {
 		Aliases: []string{"meta"},
 		Args:    cobra.MinimumNArgs(3),
 		Run: func(cmd *cobra.Command, args []string) {
+			var err error
 			var (
-				err          error
 				snapshotPath = args[0]
 				host         = args[1]
 				pid          = args[2]
 			)
 			client := api.NewMetaHttpClient(host, false)
 			defer func() {
 				if err != nil {
-					errout("Error: %v", err)
+					errout("Verify metadata consistency failed: %v\n", err)
+					log.LogError(err)
+					log.LogFlush()
+					os.Exit(1)
 				}
 			}()
 			id, err := strconv.ParseUint(pid, 10, 64)
 			if err != nil {
-				err = fmt.Errorf("parse pid[%v] failed: %v\n", pid, err)
+				errout("parse pid[%v] failed: %v\n", pid, err)
 				return
 			}
 			cursor, err := client.GetMetaPartition(id)
@@ -85,9 +90,11 @@
 			}
 			stdout("[Meta partition is %v, verify result]\n", id)
 			if err = verifyDentry(client, mp); err != nil {
+				stdout("%v\n", err)
 				return
 			}
 			if err = verifyInode(client, mp); err != nil {
+				stdout("%v\n", err)
 				return
 			}
 			stdout("All meta has checked\n")
@@ -104,27 +111,25 @@ func verifyDentry(client *api.MetaHttpClient, mp metanode.MetaPartition) (err er
 	mp.GetDentryTree().Ascend(func(d metanode.BtreeItem) bool {
 		dentry, ok := d.(*metanode.Dentry)
 		if !ok {
-			stdout("item type is not *metanode.Dentry \n")
+			stdout("item type is not *metanode.Dentry")
 			err = fmt.Errorf("item type is not *metanode.Dentry")
-			return true
+			return false
 		}
 		key := fmt.Sprintf("%v_%v", dentry.ParentId, dentry.Name)
 		oldDentry, ok := dentryMap[key]
 		if !ok {
-			stdout("dentry %v is not in old version \n", key)
+			stdout("dentry %v is not in old version", key)
 			err = fmt.Errorf("dentry %v is not in old version", key)
 			return false
 		}
 		if !reflect.DeepEqual(dentry, oldDentry) {
-			stdout("dentry %v is not equal with old version \n", key)
+			stdout("dentry %v is not equal with old version", key)
 			err = fmt.Errorf("dentry %v is not equal with old version,dentry[%v],oldDentry[%v]", key, dentry, oldDentry)
 			return false
 		}
 		return true
 	})
-	if err == nil {
-		stdout("The number of dentry is %v, all dentry are consistent \n", mp.GetDentryTree().Len())
-	}
+	stdout("The number of dentry is %v, all dentry are consistent \n", mp.GetDentryTree().Len())
 	return
 }
 
@@ -137,15 +142,12 @@ func verifyInode(client *api.MetaHttpClient, mp metanode.MetaPartition) (err err
 	mp.GetInodeTree().Ascend(func(d metanode.BtreeItem) bool {
 		inode, ok := d.(*metanode.Inode)
 		if !ok {
-			stdout("item type is not *metanode.Inode \n")
-			err = fmt.Errorf("item type is not *metanode.Inode")
 			return true
 		}
 		oldInode, ok := inodesMap[inode.Inode]
 		if !ok {
 			stdout("inode %v is not in old version \n", inode.Inode)
-			err = fmt.Errorf("inode %v is not in old version", inode.Inode)
-			return false
+			return true
 		}
 		localInode = &api.Inode{
 			Inode: inode.Inode,
@@ -169,13 +171,9 @@
 		})
 		if !reflect.DeepEqual(oldInode, localInode) {
 			stdout("inode %v is not equal with old version,inode[%v],oldInode[%v]\n", inode.Inode, inode, oldInode)
-			err = fmt.Errorf("inode %v is not equal with old version,inode[%v],oldInode[%v]\n", inode.Inode, inode, oldInode)
-			return false
 		}
 		return true
 	})
-	if err == nil {
-		stdout("The number of inodes is %v, all inodes are consistent \n", mp.GetInodeTree().Len())
-	}
+	stdout("The number of inodes is %v, all inodes are consistent \n", mp.GetInodeTree().Len())
 	return
 }
diff --git a/cli/cmd/config.go b/cli/cmd/config.go
index 3e919c2d01..62c03bbc68 100644
--- a/cli/cmd/config.go
+++ b/cli/cmd/config.go
@@ -37,6 +37,8 @@ var (
   "masterAddr": [
     "master.chubao.io"
   ],
+  "dnProf": 17320,
+  "mnProf": 17220,
   "timeout": 60
 }
 `)
 )
 
 type Config struct {
-	MasterAddr []string `json:"masterAddr"`
-	Timeout    uint16   `json:"timeout"`
+	MasterAddr       []string `json:"masterAddr"`
+	DataNodeProfPort uint16   `json:"dnProf"`
+	MetaNodeProfPort uint16   `json:"mnProf"`
+	Timeout          uint16   `json:"timeout"`
 }
 
 func newConfigCmd() *cobra.Command {
@@ -65,35 +69,50 @@ const (
 
 func newConfigSetCmd() *cobra.Command {
 	var optMasterHost string
+	var optDNProfPort uint16
+	var optMNProfPort uint16
 	var optTimeout uint16
 	var cmd = &cobra.Command{
 		Use:   CliOpSet,
 		Short: cmdConfigSetShort,
 		Long:  `Set the config file`,
 		Run: func(cmd *cobra.Command, args []string) {
-			var (
-				err         error
-				masterHosts []string
-			)
-			defer func() {
-				if err != nil {
-					errout("Error: %v", err)
-				}
-			}()
-			if optMasterHost == "" && optTimeout == 0 {
-				stdout(fmt.Sprintf("No change. Input 'cfs-cli config set -h' for help.\n"))
+			var masterHosts []string
+			var config *Config
+			var err error
+			if optMasterHost == "" && optDNProfPort == 0 && optMNProfPort == 0 && optTimeout == 0 {
+				stdout(fmt.Sprintf("No changes have been set. Input 'cfs-cli config set -h' for help.\n"))
 				return
 			}
 			if len(optMasterHost) != 0 {
 				masterHosts = append(masterHosts, optMasterHost)
 			}
-			if err = setConfig(masterHosts, optTimeout); err != nil {
+			if config, err = LoadConfig(); err != nil {
+				stdout("load config file failed\n")
+				return
+			}
+			if len(masterHosts) > 0 {
+				config.MasterAddr = masterHosts
+			}
+			if optDNProfPort > 0 {
+				config.DataNodeProfPort = optDNProfPort
+			}
+			if optMNProfPort > 0 {
+				config.MetaNodeProfPort = optMNProfPort
+			}
+			if optTimeout > 0 {
+				config.Timeout = optTimeout
+			}
+			if _, err := setConfig(config); err != nil {
+				stdout("error: %v\n", err)
 				return
 			}
 			stdout(fmt.Sprintf("Config has been set successfully!\n"))
 		},
 	}
 	cmd.Flags().StringVar(&optMasterHost, "addr", "", "Specify master address [{HOST}:{PORT}]")
+	cmd.Flags().Uint16Var(&optDNProfPort, "dnProf", 0, "Specify prof port for DataNode")
+	cmd.Flags().Uint16Var(&optMNProfPort, "mnProf", 0, "Specify prof port for MetaNode")
 	cmd.Flags().Uint16Var(&optTimeout, "timeout", 0, "Specify timeout for requests [Unit: s]")
 	return cmd
 }
@@ -107,9 +126,10 @@
 			config, err := LoadConfig()
 			if err != nil {
 				_, _ = fmt.Fprintf(os.Stderr, "Error: %v\n", err)
-				OsExitWithLogFlush()
+				os.Exit(1)
 			}
-			printConfigInfo(config)
+			stdout(fmt.Sprintf("Config info:\n %v\n", config.MasterAddr))
+
 		},
 	}
 	cmd.Flags().StringVar(&optFilterWritable, "filter-writable", "", "Filter node writable status")
@@ -117,33 +137,17 @@
 	return cmd
 }
 
-func printConfigInfo(config *Config) {
-	stdout("Config info:\n")
-	stdout(" Master Address     : %v\n", config.MasterAddr)
-	stdout(" Request Timeout [s]: %v\n", config.Timeout)
-}
-
-func setConfig(masterHosts []string, timeout uint16) (err error) {
-	var config *Config
-	if config, err = LoadConfig(); err != nil {
-		return
-	}
-	if len(masterHosts) > 0 {
-		config.MasterAddr = masterHosts
-	}
-	if timeout != 0 {
-		config.Timeout = timeout
-	}
+func setConfig(config *Config) (*Config, error) {
+	var err error
 	var configData []byte
 	if configData, err = json.Marshal(config); err != nil {
-		return
+		return nil, err
 	}
 	if err = ioutil.WriteFile(defaultConfigPath, configData, 0600); err != nil {
-		return
+		return nil, err
 	}
-	return nil
+	return config, nil
 }
-
 func LoadConfig() (*Config, error) {
 	var err error
 	var configData []byte
@@ -160,8 +164,5 @@
 	if err = json.Unmarshal(configData, config); err != nil {
 		return nil, err
 	}
-	if config.Timeout == 0 {
-		config.Timeout = defaultConfigTimeout
-	}
 	return config, nil
 }
diff --git a/cli/cmd/const.go b/cli/cmd/const.go
index c96d0c3758..e46b1cfdff 100644
--- a/cli/cmd/const.go
+++ b/cli/cmd/const.go
@@ -34,8 +34,8 @@ const (
 	CliOpReset      = "reset"
 	CliOpReplicate  = "add-replica"
 	CliOpDelReplica = "del-replica"
-	CliOpExpand     = "expand"
-	CliOpShrink     = "shrink"
+	CliOpExpand = "expand"
+	CliOpShrink = "shrink"
 
 	//Shorthand format of operation name
 	CliOpDecommissionShortHand = "dec"
@@ -59,8 +59,10 @@ const (
 	CliFlagReplicas           = "replicas"
 	CliFlagEnable             = "enable"
 	CliFlagEnableFollowerRead = "follower-read"
+	CliFlagAutoRepair         = "auto-repair"
 	CliFlagAuthenticate       = "authenticate"
 	CliFlagEnableToken        = "enable-token"
+	CliFlagEnableAutoFill     = "auto-fill"
 	CliFlagCapacity           = "capacity"
 	CliFlagThreshold          = "threshold"
 	CliFlagAddress            = "addr"
@@ -82,10 +84,3 @@ const (
 	ResourceDataPartitionShortHand = "dp"
 	ResourceMetaPartitionShortHand = "mp"
 )
-type MasterOp int
-const (
-	OpExpandVol MasterOp = iota
-	OpShrinkVol
-
OpCreateVol - OpDeleteVol -) diff --git a/cli/cmd/datanode.go b/cli/cmd/datanode.go index 05d5e5a27e..013b8ed401 100644 --- a/cli/cmd/datanode.go +++ b/cli/cmd/datanode.go @@ -15,6 +15,7 @@ package cmd import ( + "os" "sort" "strings" @@ -57,7 +58,8 @@ func newDataNodeListCmd(client *master.MasterClient) *cobra.Command { var err error defer func() { if err != nil { - errout("Error: %v", err) + errout("List cluster data nodes failed: %v\n", err) + os.Exit(1) } }() var view *proto.ClusterView @@ -98,7 +100,8 @@ func newDataNodeInfoCmd(client *master.MasterClient) *cobra.Command { var datanodeInfo *proto.DataNodeInfo defer func() { if err != nil { - errout("Error: %v", err) + errout("Show data node info failed: %v\n", err) + os.Exit(1) } }() nodeAddr = args[0] @@ -129,7 +132,8 @@ func newDataNodeDecommissionCmd(client *master.MasterClient) *cobra.Command { var nodeAddr string defer func() { if err != nil { - errout("Error: %v", err) + errout("decommission data node failed, err[%v]\n", err) + os.Exit(1) } }() nodeAddr = args[0] diff --git a/cli/cmd/datapartition.go b/cli/cmd/datapartition.go index b5a0ef31cd..248c328f30 100644 --- a/cli/cmd/datapartition.go +++ b/cli/cmd/datapartition.go @@ -21,6 +21,9 @@ import ( "github.com/spf13/cobra" "sort" "strconv" + "strings" + "sync" + "time" ) const ( @@ -44,12 +47,12 @@ func newDataPartitionCmd(client *master.MasterClient) *cobra.Command { } const ( - cmdDataPartitionGetShort = "Display detail information of a data partition" - cmdCheckCorruptDataPartitionShort = "Check and list unhealthy data partitions" - cmdDataPartitionDecommissionShort = "Decommission a replication of the data partition to a new address" - cmdDataPartitionReplicateShort = "Add a replication of the data partition on a new address" - cmdDataPartitionDeleteReplicaShort = "Delete a replication of the data partition on a fixed address" - ) + cmdDataPartitionGetShort = "Display detail information of a data partition" + cmdCheckCorruptDataPartitionShort = "Check out corrupt data partitions" + cmdDataPartitionDecommissionShort = "Decommission a replication of the data partition to a new address" + cmdDataPartitionReplicateShort = "Add a replication of the data partition on a new address" + cmdDataPartitionDeleteReplicaShort = "Delete a replication of the data partition on a fixed address" +) func newDataPartitionGetCmd(client *master.MasterClient) *cobra.Command { var cmd = &cobra.Command{ @@ -58,16 +61,10 @@ func newDataPartitionGetCmd(client *master.MasterClient) *cobra.Command { Args: cobra.MinimumNArgs(1), Run: func(cmd *cobra.Command, args []string) { var ( - err error - partitionID uint64 - partition *proto.DataPartitionInfo + partition *proto.DataPartitionInfo ) - defer func() { - if err != nil { - errout("Error: %v", err) - } - }() - if partitionID, err = strconv.ParseUint(args[0], 10, 64); err != nil { + partitionID, err := strconv.ParseUint(args[0], 10, 64) + if err != nil { return } if partition, err = client.AdminAPI().GetDataPartition("", partitionID); err != nil { @@ -80,6 +77,8 @@ func newDataPartitionGetCmd(client *master.MasterClient) *cobra.Command { } func newListCorruptDataPartitionCmd(client *master.MasterClient) *cobra.Command { + var optEnableAutoFullfill bool + var optCheckAll bool var cmd = &cobra.Command{ Use: CliOpCheck, Short: cmdCheckCorruptDataPartitionShort, @@ -91,25 +90,26 @@ you can use the "reset" command to fix the problem.The "reset" command may lead The "reset" command will be released in next version`, Run: func(cmd *cobra.Command, args []string) 
{ var ( - diagnosis *proto.DataPartitionDiagnosis - dataNodes []*proto.DataNodeInfo - err error + diagnosis *proto.DataPartitionDiagnosis + dataNodes []*proto.DataNodeInfo + err error ) - defer func() { + if optCheckAll { + err = checkAllDataPartitions(client) if err != nil { - errout("Error: %v", err) + stdout("%v\n", err) } - }() + return + } if diagnosis, err = client.AdminAPI().DiagnoseDataPartition(); err != nil { + stdout("%v\n", err) return } stdout("[Inactive Data nodes]:\n") stdout("%v\n", formatDataNodeDetailTableHeader()) for _, addr := range diagnosis.InactiveDataNodes { var node *proto.DataNodeInfo - if node, err = client.NodeAPI().GetDataNode(addr); err != nil { - return - } + node, err = client.NodeAPI().GetDataNode(addr) dataNodes = append(dataNodes, node) } sort.SliceStable(dataNodes, func(i, j int) bool { @@ -118,7 +118,7 @@ The "reset" command will be released in next version`, for _, node := range dataNodes { stdout("%v\n", formatDataNodeDetail(node, true)) } - stdout("\n") + /*stdout("\n") stdout("[Corrupt data partitions](no leader):\n") stdout("%v\n", partitionInfoTableHeader) sort.SliceStable(diagnosis.CorruptDataPartitionIDs, func(i, j int) bool { @@ -127,11 +127,11 @@ The "reset" command will be released in next version`, for _, pid := range diagnosis.CorruptDataPartitionIDs { var partition *proto.DataPartitionInfo if partition, err = client.AdminAPI().GetDataPartition("", pid); err != nil { - err = fmt.Errorf("Partition not found, err:[%v] ", err) + stdout("Partition not found, err:[%v]", err) return } stdout("%v\n", formatDataPartitionInfoRow(partition)) - } + }*/ stdout("\n") stdout("%v\n", "[Partition lack replicas]:") @@ -139,57 +139,206 @@ The "reset" command will be released in next version`, sort.SliceStable(diagnosis.LackReplicaDataPartitionIDs, func(i, j int) bool { return diagnosis.LackReplicaDataPartitionIDs[i] < diagnosis.LackReplicaDataPartitionIDs[j] }) + cv, _ := client.AdminAPI().GetCluster() + dns := cv.DataNodes + var sb = strings.Builder{} + for _, pid := range diagnosis.LackReplicaDataPartitionIDs { var partition *proto.DataPartitionInfo if partition, err = client.AdminAPI().GetDataPartition("", pid); err != nil { - err = fmt.Errorf("Partition not found, err:[%v] ", err) + stdout("Partition is not found, err:[%v]", err) return } if partition != nil { stdout("%v\n", formatDataPartitionInfoRow(partition)) - } - } + sort.Strings(partition.Hosts) + if len(partition.MissingNodes) > 0 || partition.Status == -1 { + stdoutRed(fmt.Sprintf("partition not ready to repair")) + continue + } + var leaderRps map[uint64]*proto.ReplicaStatus + var canAutoRepair bool + var peerStrings []string + canAutoRepair = true + for i, r := range partition.Replicas { + var rps map[uint64]*proto.ReplicaStatus + var dnPartition *proto.DNDataPartitionInfo + var err error + addr := strings.Split(r.Addr, ":")[0] + if dnPartition, err = client.NodeAPI().DataNodeGetPartition(addr, partition.PartitionID); err != nil { + fmt.Printf(partitionInfoColorTablePattern+"\n", + "", "", "", fmt.Sprintf("%v(hosts)", r.Addr), fmt.Sprintf("%v/%v", "nil", partition.ReplicaNum), "get partition info failed") + continue + } + sort.Strings(dnPartition.Replicas) + fmt.Printf(partitionInfoColorTablePattern+"\n", + "", "", "", fmt.Sprintf("%v(hosts)", r.Addr), fmt.Sprintf("%v/%v", len(dnPartition.Replicas), partition.ReplicaNum), strings.Join(dnPartition.Replicas, "; ")) - - stdout("\n") - stdout("%v\n", "[Bad data partitions(decommission not completed)]:") - badPartitionTablePattern := "%-8v 
%-10v\n" - stdout(badPartitionTablePattern, "PATH", "PARTITION ID") - for _, bdpv := range diagnosis.BadDataPartitionIDs { - sort.SliceStable(bdpv.PartitionIDs, func(i, j int) bool { - return bdpv.PartitionIDs[i] < bdpv.PartitionIDs[j] - }) - for _, pid := range bdpv.PartitionIDs { - stdout(badPartitionTablePattern, bdpv.Path, pid) + if rps = dnPartition.RaftStatus.Replicas; rps != nil { + leaderRps = rps + } + peers := convertPeersToArray(dnPartition.Peers) + sort.Strings(peers) + if i == 0 { + peerStrings = peers + } else { + if !isEqualStrings(peers, peerStrings) { + canAutoRepair = false + } + } + fmt.Printf(partitionInfoColorTablePattern+"\n", + "", "", "", fmt.Sprintf("%v(peers)", r.Addr), fmt.Sprintf("%v/%v", len(peers), partition.ReplicaNum), strings.Join(peers, "; ")) + } + if len(leaderRps) != 3 || len(partition.Hosts) != 2 { + stdoutRed(fmt.Sprintf("raft peer number(expected is 3, but is %v) or replica number(expected is 2, but is %v) not match ", len(leaderRps), len(partition.Hosts))) + continue + } + var lackAddr []string + for _, dn := range dns { + if _, ok := leaderRps[dn.ID]; ok { + if !contains(partition.Hosts, dn.Addr) { + lackAddr = append(lackAddr, dn.Addr) + } + } + } + if len(lackAddr) != 1 { + stdoutRed(fmt.Sprintf("Not classic partition, please check and repair it manually")) + continue + } + stdoutGreen(fmt.Sprintf(" The Lack Address is: %v", lackAddr)) + if canAutoRepair { + sb.WriteString(fmt.Sprintf("cfs-cli datapartition add-replica %v %v\n", lackAddr[0], partition.PartitionID)) + } + if optEnableAutoFullfill && canAutoRepair { + stdoutGreen(" Auto Repair Begin:") + if err = client.AdminAPI().AddDataReplica(partition.PartitionID, lackAddr[0]); err != nil { + stdoutRed(fmt.Sprintf("%v err:%v", " Failed.", err)) + continue + } + stdoutGreen(" Done.") + time.Sleep(2 * time.Second) + } + stdoutGreen(strings.Repeat("_ ", len(partitionInfoTableHeader)/2+20) + "\n") } } + if !optEnableAutoFullfill { + stdout(sb.String()) + } return }, } + cmd.Flags().BoolVar(&optEnableAutoFullfill, CliFlagEnableAutoFill, false, "true - automatically full fill the missing replica") + cmd.Flags().BoolVar(&optCheckAll, "all", false, "true - check all partitions; false - only check partitions which lack of replica") return cmd } +func checkAllDataPartitions(client *master.MasterClient) (err error) { + var volInfo []*proto.VolInfo + if volInfo, err = client.AdminAPI().ListVols(""); err != nil { + stdout("%v\n", err) + return + } + stdout("\n") + stdout("%v\n", "[Partition peer info not valid]:") + stdout("%v\n", partitionInfoTableHeader) + for _, vol := range volInfo { + var volView *proto.VolView + if volView, err = client.ClientAPI().GetVolume(vol.Name, calcAuthKey(vol.Owner)); err != nil { + stdout("Found an invalid vol: %v\n", vol.Name) + continue + } + sort.SliceStable(volView.DataPartitions, func(i, j int) bool { + return volView.DataPartitions[i].PartitionID < volView.DataPartitions[j].PartitionID + }) + var wg sync.WaitGroup + for _, dp := range volView.DataPartitions { + wg.Add(1) + go func(dp *proto.DataPartitionResponse) { + defer wg.Done() + var outPut string + var isHealthy bool + outPut, isHealthy, _ = checkDataPartition(dp.PartitionID, client) + if !isHealthy { + fmt.Printf(outPut) + stdoutGreen(strings.Repeat("_ ", len(partitionInfoTableHeader)/2+20) + "\n") + } + }(dp) + } + wg.Wait() + } + return +} +func checkDataPartition(pid uint64, client *master.MasterClient) (outPut string, isHealthy bool, err error) { + var partition *proto.DataPartitionInfo + var sb = 
strings.Builder{} + isHealthy = true + if partition, err = client.AdminAPI().GetDataPartition("", pid); err != nil { + sb.WriteString(fmt.Sprintf("Partition is not found, err:[%v]", err)) + return + } + if partition != nil { + sb.WriteString(fmt.Sprintf("%v\n", formatDataPartitionInfoRow(partition))) + sort.Strings(partition.Hosts) + if len(partition.MissingNodes) > 0 || partition.Status == -1 || len(partition.Hosts) != int(partition.ReplicaNum) { + errMsg := fmt.Sprintf("The partition is not healthy according to the report message from master") + sb.WriteString(fmt.Sprintf("\033[1;40;31m%-8v\033[0m\n", errMsg)) + isHealthy = false + } + var leaderRps map[uint64]*proto.ReplicaStatus + for _, r := range partition.Replicas { + var rps map[uint64]*proto.ReplicaStatus + var dnPartition *proto.DNDataPartitionInfo + var err error + addr := strings.Split(r.Addr, ":")[0] + if dnPartition, err = client.NodeAPI().DataNodeGetPartition(addr, partition.PartitionID); err != nil { + sb.WriteString(fmt.Sprintf(partitionInfoColorTablePattern+"\n", + "", "", "", fmt.Sprintf("%v", r.Addr), fmt.Sprintf("%v/%v", "nil", partition.ReplicaNum), fmt.Sprintf("get partition info failed, err:%v", err))) + isHealthy = false + continue + } + sort.Strings(dnPartition.Replicas) + sb.WriteString(fmt.Sprintf(partitionInfoColorTablePattern+"\n", + "", "", "", fmt.Sprintf("%v(hosts)", r.Addr), fmt.Sprintf("%v/%v", len(dnPartition.Replicas), partition.ReplicaNum), strings.Join(dnPartition.Replicas, "; "))) + if rps = dnPartition.RaftStatus.Replicas; rps != nil { + leaderRps = rps + } + peerStrings := convertPeersToArray(dnPartition.Peers) + sort.Strings(peerStrings) + sb.WriteString(fmt.Sprintf(partitionInfoColorTablePattern+"\n", + "", "", "", fmt.Sprintf("%v(peers)", r.Addr), fmt.Sprintf("%v/%v", len(peerStrings), partition.ReplicaNum), strings.Join(peerStrings, "; "))) + if !isEqualStrings(peerStrings, dnPartition.Replicas) { + isHealthy = false + } + if !isEqualStrings(partition.Hosts, peerStrings) { + isHealthy = false + } + if len(peerStrings) != int(partition.ReplicaNum) || len(dnPartition.Replicas) != int(partition.ReplicaNum) { + isHealthy = false + } + } + if len(leaderRps) == 0 { + isHealthy = false + errMsg := fmt.Sprintf("no raft leader") + sb.WriteString(fmt.Sprintf("\033[1;40;31m%-8v\033[0m\n", errMsg)) + } + } + outPut = sb.String() + return +} func newDataPartitionDecommissionCmd(client *master.MasterClient) *cobra.Command { var cmd = &cobra.Command{ Use: CliOpDecommission + " [ADDRESS] [DATA PARTITION ID]", Short: cmdDataPartitionDecommissionShort, Args: cobra.MinimumNArgs(2), Run: func(cmd *cobra.Command, args []string) { - var ( - err error - partitionID uint64 - ) - defer func() { - if err != nil { - errout("Error: %v", err) - } - }() address := args[0] - partitionID, err = strconv.ParseUint(args[1], 10, 64) + partitionID, err := strconv.ParseUint(args[1], 10, 64) if err != nil { + stdout("%v\n", err) return } if err = client.AdminAPI().DecommissionDataPartition(partitionID, address); err != nil { + stdout("%v\n", err) return } }, @@ -209,20 +358,14 @@ func newDataPartitionReplicateCmd(client *master.MasterClient) *cobra.Command { Short: cmdDataPartitionReplicateShort, Args: cobra.MinimumNArgs(2), Run: func(cmd *cobra.Command, args []string) { - var ( - err error - partitionID uint64 - ) - defer func() { - if err != nil { - errout("Error: %v", err) - } - }() address := args[0] - if partitionID, err = strconv.ParseUint(args[1], 10, 64); err != nil { + partitionID, err := strconv.ParseUint(args[1], 10, 64) 
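// [Editor's note] The checks in checkDataPartition above reduce to one
// invariant: the master's host list and every replica's self-reported member
// lists (hosts and raft peers) must agree, and each must contain exactly
// ReplicaNum entries. A minimal, self-contained sketch of that predicate
// (replicaSetsConsistent is a hypothetical helper, not part of this patch):
package main

import (
	"fmt"
	"sort"
)

// replicaSetsConsistent reports whether the master view and every
// per-replica view agree on membership and on the expected size.
func replicaSetsConsistent(masterHosts []string, replicaViews [][]string, replicaNum int) bool {
	sort.Strings(masterHosts)
	if len(masterHosts) != replicaNum {
		return false
	}
	for _, view := range replicaViews {
		sort.Strings(view)
		if len(view) != replicaNum {
			return false
		}
		for i := range view {
			if view[i] != masterHosts[i] {
				return false
			}
		}
	}
	return true
}

func main() {
	master := []string{"192.168.0.2:17310", "192.168.0.1:17310", "192.168.0.3:17310"}
	views := [][]string{
		{"192.168.0.1:17310", "192.168.0.2:17310", "192.168.0.3:17310"},
		{"192.168.0.1:17310", "192.168.0.2:17310"}, // this replica lacks one member
	}
	fmt.Println(replicaSetsConsistent(master, views, 3)) // false: partition needs repair
}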
+ if err != nil { + stdout("%v\n", err) return } if err = client.AdminAPI().AddDataReplica(partitionID, address); err != nil { + stdout("%v\n", err) return } }, @@ -242,20 +385,14 @@ func newDataPartitionDeleteReplicaCmd(client *master.MasterClient) *cobra.Comman Short: cmdDataPartitionDeleteReplicaShort, Args: cobra.MinimumNArgs(2), Run: func(cmd *cobra.Command, args []string) { - var ( - err error - partitionID uint64 - ) - defer func() { - if err != nil { - errout("Error: %v", err) - } - }() address := args[0] - if partitionID, err = strconv.ParseUint(args[1], 10, 64); err != nil { + partitionID, err := strconv.ParseUint(args[1], 10, 64) + if err != nil { + stdout("%v\n", err) return } if err = client.AdminAPI().DeleteDataReplica(partitionID, address); err != nil { + stdout("%v\n", err) return } }, diff --git a/cli/cmd/fmt.go b/cli/cmd/fmt.go index caafb86600..4fa6e6bedc 100644 --- a/cli/cmd/fmt.go +++ b/cli/cmd/fmt.go @@ -17,6 +17,7 @@ package cmd import ( "fmt" "math" + "sort" "strconv" "strings" "time" @@ -26,16 +27,18 @@ import ( func formatClusterView(cv *proto.ClusterView) string { var sb = strings.Builder{} - sb.WriteString(fmt.Sprintf(" Cluster name : %v\n", cv.Name)) - sb.WriteString(fmt.Sprintf(" Master leader : %v\n", cv.LeaderAddr)) - sb.WriteString(fmt.Sprintf(" Auto allocate : %v\n", formatEnabledDisabled(!cv.DisableAutoAlloc))) - sb.WriteString(fmt.Sprintf(" MetaNode count : %v\n", len(cv.MetaNodes))) - sb.WriteString(fmt.Sprintf(" MetaNode used : %v GB\n", cv.MetaNodeStatInfo.UsedGB)) - sb.WriteString(fmt.Sprintf(" MetaNode total : %v GB\n", cv.MetaNodeStatInfo.TotalGB)) - sb.WriteString(fmt.Sprintf(" DataNode count : %v\n", len(cv.DataNodes))) - sb.WriteString(fmt.Sprintf(" DataNode used : %v GB\n", cv.DataNodeStatInfo.UsedGB)) - sb.WriteString(fmt.Sprintf(" DataNode total : %v GB\n", cv.DataNodeStatInfo.TotalGB)) - sb.WriteString(fmt.Sprintf(" Volume count : %v\n", len(cv.VolStatInfo))) + sb.WriteString(fmt.Sprintf(" Cluster name : %v\n", cv.Name)) + sb.WriteString(fmt.Sprintf(" Master leader : %v\n", cv.LeaderAddr)) + sb.WriteString(fmt.Sprintf(" Auto allocate : %v\n", formatEnabledDisabled(!cv.DisableAutoAlloc))) + sb.WriteString(fmt.Sprintf(" MetaNode count : %v\n", len(cv.MetaNodes))) + sb.WriteString(fmt.Sprintf(" MetaNode used : %v GB\n", cv.MetaNodeStatInfo.UsedGB)) + sb.WriteString(fmt.Sprintf(" MetaNode total : %v GB\n", cv.MetaNodeStatInfo.TotalGB)) + sb.WriteString(fmt.Sprintf(" DataNode count : %v\n", len(cv.DataNodes))) + sb.WriteString(fmt.Sprintf(" DataNode used : %v GB\n", cv.DataNodeStatInfo.UsedGB)) + sb.WriteString(fmt.Sprintf(" DataNode total : %v GB\n", cv.DataNodeStatInfo.TotalGB)) + sb.WriteString(fmt.Sprintf(" Volume count : %v\n", len(cv.VolStatInfo))) + sb.WriteString(fmt.Sprintf(" Dp recover pool : %v\n", cv.DpRecoverPool)) + sb.WriteString(fmt.Sprintf(" Mp recover pool : %v\n", cv.MpRecoverPool)) return sb.String() } @@ -99,8 +102,8 @@ func formatSimpleVolView(svv *proto.SimpleVolView) string { sb.WriteString(fmt.Sprintf(" Create time : %v\n", svv.CreateTime)) sb.WriteString(fmt.Sprintf(" Authenticate : %v\n", formatEnabledDisabled(svv.Authenticate))) sb.WriteString(fmt.Sprintf(" Follower read : %v\n", formatEnabledDisabled(svv.FollowerRead))) - sb.WriteString(fmt.Sprintf(" Enable token : %v\n", formatEnabledDisabled(svv.EnableToken))) sb.WriteString(fmt.Sprintf(" Cross zone : %v\n", formatEnabledDisabled(svv.CrossZone))) + sb.WriteString(fmt.Sprintf(" Auto repair : %v\n", formatEnabledDisabled(svv.AutoRepair))) sb.WriteString(fmt.Sprintf(" 
Inode count : %v\n", svv.InodeCount)) sb.WriteString(fmt.Sprintf(" Dentry count : %v\n", svv.DentryCount)) sb.WriteString(fmt.Sprintf(" Max metaPartition ID : %v\n", svv.MaxMetaPartitionID)) @@ -133,6 +136,17 @@ func formatVolInfoTableRow(vi *proto.VolInfo) string { formatVolumeStatus(vi.Status), time.Unix(vi.CreateTime, 0).Local().Format(time.RFC1123)) } +var ( + volumeDetailInfoTablePattern = "%-63v %-20v %-30v %-10v %-12v %-8v %-8v %-8v %-8v %-10v" + volumeDetailInfoTableHeader = fmt.Sprintf(volumeDetailInfoTablePattern, "VOLUME", "OWNER", "ZONE NAME", "CROSS ZONE", "INODE COUNT", "DP COUNT", "USED", "TOTAL", "STATUS", "CREATE TIME") +) + +func formatVolDetailInfoTableRow(vv *proto.SimpleVolView, vi *proto.VolInfo) string { + return fmt.Sprintf(volumeDetailInfoTablePattern, + vv.Name, vv.Owner, vv.ZoneName, vv.CrossZone, vv.InodeCount, vv.DpCnt, formatSize(vi.UsedSize), formatSize(vi.TotalSize), + formatVolumeStatus(vi.Status), time.Unix(vi.CreateTime, 0).Local().Format(time.RFC1123)) +} + var ( dataPartitionTablePattern = "%-8v %-8v %-10v %-10v %-18v %-18v" dataPartitionTableHeader = fmt.Sprintf(dataPartitionTablePattern, @@ -146,19 +160,25 @@ func formatDataPartitionTableRow(view *proto.DataPartitionResponse) string { } var ( - partitionInfoTablePattern = "%-8v %-8v %-10v %-18v %-18v" - partitionInfoTableHeader = fmt.Sprintf(partitionInfoTablePattern, - "ID", "VOLUME", "REPLICAS", "STATUS", "MEMBERS") + partitionInfoTablePattern = "%-8v %-25v %-10v %-28v %-10v %-18v" + partitionInfoColorTablePattern = "%-8v %-25v %-10v %-28v \033[1;40;32m%-10v\033[0m %-18v" + partitionInfoTableHeader = fmt.Sprintf(partitionInfoTablePattern, + "ID", "VOLUME", "STATUS", "POSITION", "REPLICANUM", "HOSTS") ) func formatDataPartitionInfoRow(partition *proto.DataPartitionInfo) string { - return fmt.Sprintf(partitionInfoTablePattern, - partition.PartitionID, partition.VolName, partition.ReplicaNum, formatDataPartitionStatus(partition.Status), strings.Join(partition.Hosts, ", ")) + var sb = strings.Builder{} + sort.Strings(partition.Hosts) + sb.WriteString(fmt.Sprintf(partitionInfoTablePattern, + partition.PartitionID, partition.VolName, formatDataPartitionStatus(partition.Status), "Master", fmt.Sprintf("%v/%v", len(partition.Hosts), partition.ReplicaNum), strings.Join(partition.Hosts, "; "))) + return sb.String() } func formatMetaPartitionInfoRow(partition *proto.MetaPartitionInfo) string { - return fmt.Sprintf(partitionInfoTablePattern, - partition.PartitionID, partition.VolName, partition.ReplicaNum, formatDataPartitionStatus(partition.Status), strings.Join(partition.Hosts, ", ")) + var sb = strings.Builder{} + sb.WriteString(fmt.Sprintf(partitionInfoTablePattern, + partition.PartitionID, partition.VolName, formatDataPartitionStatus(partition.Status), "Master", fmt.Sprintf("%v/%v", len(partition.Hosts), partition.ReplicaNum), strings.Join(partition.Hosts, "; "))) + return sb.String() } func formatDataPartitionInfo(partition *proto.DataPartitionInfo) string { @@ -179,7 +199,7 @@ func formatDataPartitionInfo(partition *proto.DataPartitionInfo) string { sb.WriteString(fmt.Sprintf("Peers :\n")) sb.WriteString(fmt.Sprintf("%v\n", formatPeerTableHeader())) for _, peer := range partition.Peers { - sb.WriteString(fmt.Sprintf("%v\n", formatPeer( peer))) + sb.WriteString(fmt.Sprintf("%v\n", formatPeer(peer))) } sb.WriteString("\n") sb.WriteString(fmt.Sprintf("Hosts :\n")) @@ -223,7 +243,7 @@ func formatMetaPartitionInfo(partition *proto.MetaPartitionInfo) string { sb.WriteString("\n") 
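// [Editor's note] The table patterns above ("%-8v %-25v ...") rely on fmt's
// width flags: "-" left-aligns a field and the number is its minimum width,
// which is what keeps these plain-text tables in columns. A tiny standalone
// illustration (the row values are made up):
package main

import "fmt"

func main() {
	pattern := "%-8v %-25v %-10v\n" // same style as partitionInfoTablePattern
	fmt.Printf(pattern, "ID", "VOLUME", "STATUS")
	fmt.Printf(pattern, 17, "ltptest", "ReadWrite")
}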
sb.WriteString(fmt.Sprintf("Peers :\n")) for _, peer := range partition.Peers { - sb.WriteString(fmt.Sprintf("%v\n", formatPeer( peer))) + sb.WriteString(fmt.Sprintf("%v\n", formatPeer(peer))) } sb.WriteString("\n") sb.WriteString(fmt.Sprintf("Hosts :\n")) @@ -245,7 +265,7 @@ func formatMetaPartitionInfo(partition *proto.MetaPartitionInfo) string { } var ( - metaPartitionTablePattern = "%-8v %-12v %-10v %-12v %-12v %-12v %-8v %-12v %-18v" + metaPartitionTablePattern = "%-8v %-12v %-12v %-12v %-12v %-12v %-10v %-20v %-18v" metaPartitionTableHeader = fmt.Sprintf(metaPartitionTablePattern, "ID", "MAX INODE", "DENTRY COUNT", "INODE COUNT", "START", "END", "STATUS", "LEADER", "MEMBERS") ) @@ -361,11 +381,11 @@ func formatTime(timeUnix int64) string { return time.Unix(timeUnix, 0).Format("2006-01-02 15:04:05") } -func formatTimeToString(t time.Time) string{ +func formatTimeToString(t time.Time) string { return t.Format("2006-01-02 15:04:05") } -var dataReplicaTableRowPattern = "%-18v %-6v %-6v %-6v %-6v %-6v %-10v" +var dataReplicaTableRowPattern = "%-20v %-8v %-8v %-8v %-12v %-10v %-12v" func formatDataReplicaTableHeader() string { return fmt.Sprintf(dataReplicaTableRowPattern, "ADDRESS", "USED", "TOTAL", "ISLEADER", "FILECOUNT", "STATUS", "REPORT TIME") @@ -374,7 +394,7 @@ func formatDataReplicaTableHeader() string { func formatDataReplica(indentation string, replica *proto.DataReplica, rowTable bool) string { if rowTable { return fmt.Sprintf(dataReplicaTableRowPattern, replica.Addr, formatSize(replica.Used), formatSize(replica.Total), - replica.IsLeader, replica.FileCount, formatDataPartitionStatus(replica.Status), formatTime(replica.ReportTime)) + replica.IsLeader, replica.FileCount, formatDataPartitionStatus(replica.Status), formatTime(replica.ReportTime)) } var sb = strings.Builder{} sb.WriteString(fmt.Sprintf("%v- Addr : %v\n", indentation, replica.Addr)) @@ -390,7 +410,7 @@ func formatDataReplica(indentation string, replica *proto.DataReplica, rowTable return sb.String() } -var metaReplicaTableRowPattern = "%-18v %-6v %-6v %-10v" +var metaReplicaTableRowPattern = "%-20v %-8v %-10v %-12v" func formatMetaReplicaTableHeader() string { return fmt.Sprintf(metaReplicaTableRowPattern, "ADDRESS", "ISLEADER", "STATUS", "REPORT TIME") @@ -399,7 +419,7 @@ func formatMetaReplicaTableHeader() string { func formatMetaReplica(indentation string, replica *proto.MetaReplicaInfo, rowTable bool) string { if rowTable { return fmt.Sprintf(metaReplicaTableRowPattern, replica.Addr, replica.IsLeader, formatMetaPartitionStatus(replica.Status), - formatTime(replica.ReportTime)) + formatTime(replica.ReportTime)) } var sb = strings.Builder{} sb.WriteString(fmt.Sprintf("%v- Addr : %v\n", indentation, replica.Addr)) @@ -409,8 +429,6 @@ func formatMetaReplica(indentation string, replica *proto.MetaReplicaInfo, rowTa return sb.String() } - - var peerTableRowPattern = "%-6v %-18v" func formatPeerTableHeader() string { @@ -420,7 +438,6 @@ func formatPeer(peer proto.Peer) string { return fmt.Sprintf(peerTableRowPattern, peer.ID, peer.Addr) } - var dataNodeDetailTableRowPattern = "%-6v %-6v %-18v %-6v %-6v %-6v %-10v" func formatDataNodeDetailTableHeader() string { @@ -475,24 +492,37 @@ func formatMetaNodeDetail(mn *proto.MetaNodeInfo, rowTable bool) string { return sb.String() } -func formatZoneView(zv *proto.ZoneView) string { - var sb = strings.Builder{} - sb.WriteString(fmt.Sprintf("Zone Name: %v\n", zv.Name)) - sb.WriteString(fmt.Sprintf("Status: %v\n", zv.Status)) - sb.WriteString(fmt.Sprintf("\n")) - for index, ns 
:= range zv.NodeSet { - sb.WriteString(fmt.Sprintf("NodeSet-%v:\n", index)) - sb.WriteString(fmt.Sprintf(" DataNodes[%v]:\n", ns.DataNodeLen)) - sb.WriteString(fmt.Sprintf(" %v\n", formatNodeViewTableHeader())) - for _, nv := range ns.DataNodes{ - sb.WriteString(fmt.Sprintf(" %v\n", formatNodeView(&nv, true))) +func contains(arr []string, element string) (ok bool) { + if arr == nil || len(arr) == 0 { + return + } + + for _, e := range arr { + if e == element { + ok = true + break } - sb.WriteString(fmt.Sprintf("\n")) - sb.WriteString(fmt.Sprintf(" MetaNodes[%v]:\n", ns.MetaNodeLen)) - sb.WriteString(fmt.Sprintf(" %v\n", formatNodeViewTableHeader())) - for _, nv := range ns.MetaNodes{ - sb.WriteString(fmt.Sprintf(" %v\n", formatNodeView(&nv, true))) + } + return +} +func convertPeersToArray(peers []*proto.Peer) (addrs []string) { + addrs = make([]string, 0) + for _, peer := range peers { + addrs = append(addrs, peer.Addr) + } + return +} + +func isEqualStrings(strs1, strs2 []string) bool { + sort.Strings(strs1) + sort.Strings(strs2) + if len(strs1) != len(strs2) { + return false + } + for i, s := range strs1 { + if strs2[i] != s { + return false } } - return sb.String() + return true } diff --git a/cli/cmd/http_service.go b/cli/cmd/http_service.go deleted file mode 100644 index 3748ff9432..0000000000 --- a/cli/cmd/http_service.go +++ /dev/null @@ -1,50 +0,0 @@ -package cmd - -import ( - "github.com/chubaofs/chubaofs/sdk/master" - "github.com/chubaofs/chubaofs/proto" -) - -type clientHandler interface { - excuteHttp() (err error) -} - -type volumeClient struct { - name string - capacity uint64 - opCode MasterOp - client *master.MasterClient -} - -func NewVolumeClient(opCode MasterOp, client *master.MasterClient) (vol *volumeClient){ - vol = new(volumeClient) - vol.opCode = opCode - vol.client = client - return -} - -func (vol *volumeClient) excuteHttp() (err error) { - switch vol.opCode { - case OpExpandVol: - var vv *proto.SimpleVolView - if vv, err = vol.client.AdminAPI().GetVolumeSimpleInfo(vol.name); err != nil { - return - } - if err = vol.client.AdminAPI().VolExpand(vol.name, vol.capacity, calcAuthKey(vv.Owner)); err != nil { - return - } - case OpShrinkVol: - var vv *proto.SimpleVolView - if vv, err = vol.client.AdminAPI().GetVolumeSimpleInfo(vol.name); err != nil { - return - } - if err = vol.client.AdminAPI().VolShrink(vol.name, vol.capacity, calcAuthKey(vv.Owner)); err != nil { - return - } - case OpDeleteVol: - default: - - } - - return -} diff --git a/cli/cmd/metanode.go b/cli/cmd/metanode.go index 4459c15255..6e7f0e5bf5 100644 --- a/cli/cmd/metanode.go +++ b/cli/cmd/metanode.go @@ -15,6 +15,7 @@ package cmd import ( + "os" "sort" "strings" @@ -58,7 +59,8 @@ func newMetaNodeListCmd(client *master.MasterClient) *cobra.Command { var err error defer func() { if err != nil { - errout("Error: %v", err) + errout("List cluster meta nodes failed: %v\n", err) + os.Exit(1) } }() var view *proto.ClusterView @@ -99,7 +101,8 @@ func newMetaNodeInfoCmd(client *master.MasterClient) *cobra.Command { var metanodeInfo *proto.MetaNodeInfo defer func() { if err != nil { - errout("Error: %v", err) + errout("Show meta node info failed: %v\n", err) + os.Exit(1) } }() nodeAddr = args[0] @@ -129,7 +132,8 @@ func newMetaNodeDecommissionCmd(client *master.MasterClient) *cobra.Command { var nodeAddr string defer func() { if err != nil { - errout("Error: %v", err) + errout("decommission meta node failed: %v\n", err) + os.Exit(1) } }() nodeAddr = args[0] diff --git a/cli/cmd/metapartition.go 
b/cli/cmd/metapartition.go index c735842102..aaf0b46c3b 100644 --- a/cli/cmd/metapartition.go +++ b/cli/cmd/metapartition.go @@ -21,6 +21,9 @@ import ( "github.com/spf13/cobra" "sort" "strconv" + "strings" + "sync" + "time" ) const ( @@ -44,12 +47,12 @@ func newMetaPartitionCmd(client *master.MasterClient) *cobra.Command { } const ( - cmdMetaPartitionGetShort = "Display detail information of a meta partition" - cmdCheckCorruptMetaPartitionShort = "Check out corrupt meta partitions" - cmdMetaPartitionDecommissionShort = "Decommission a replication of the meta partition to a new address" - cmdMetaPartitionReplicateShort = "Add a replication of the meta partition on a new address" - cmdMetaPartitionDeleteReplicaShort = "Delete a replication of the meta partition on a fixed address" - ) + cmdMetaPartitionGetShort = "Display detail information of a meta partition" + cmdCheckCorruptMetaPartitionShort = "Check out corrupt meta partitions" + cmdMetaPartitionDecommissionShort = "Decommission a replication of the meta partition to a new address" + cmdMetaPartitionReplicateShort = "Add a replication of the meta partition on a new address" + cmdMetaPartitionDeleteReplicaShort = "Delete a replication of the meta partition on a fixed address" +) func newMetaPartitionGetCmd(client *master.MasterClient) *cobra.Command { var cmd = &cobra.Command{ @@ -58,16 +61,10 @@ func newMetaPartitionGetCmd(client *master.MasterClient) *cobra.Command { Args: cobra.MinimumNArgs(1), Run: func(cmd *cobra.Command, args []string) { var ( - err error - partitionID uint64 partition *proto.MetaPartitionInfo ) - defer func() { - if err != nil { - errout("Error: %v", err) - } - }() - if partitionID, err = strconv.ParseUint(args[0], 10, 64); err != nil { + partitionID, err := strconv.ParseUint(args[0], 10, 64) + if err != nil { return } if partition, err = client.ClientAPI().GetMetaPartition(partitionID); err != nil { @@ -80,6 +77,7 @@ func newMetaPartitionGetCmd(client *master.MasterClient) *cobra.Command { } func newListCorruptMetaPartitionCmd(client *master.MasterClient) *cobra.Command { + var optCheckAll bool var cmd = &cobra.Command{ Use: CliOpCheck, Short: cmdCheckCorruptMetaPartitionShort, @@ -91,18 +89,20 @@ the corrupt nodes, the few remaining replicas can not reach an agreement with on "reset" command will be released in next version.`, Run: func(cmd *cobra.Command, args []string) { var ( - diagnosis *proto.MetaPartitionDiagnosis - metaNodes []*proto.MetaNodeInfo - err error + diagnosis *proto.MetaPartitionDiagnosis + metaNodes []*proto.MetaNodeInfo + err error ) - defer func() { + if optCheckAll { + err = checkAllMetaPartitions(client) if err != nil { - errout("Error: %v", err) + stdout("%v\n", err) } - }() - if diagnosis, err = client.AdminAPI().DiagnoseMetaPartition(); err != nil { return } + if diagnosis, err = client.AdminAPI().DiagnoseMetaPartition(); err != nil { + stdout("%v\n", err) + } stdout("[Inactive Meta nodes]:\n") stdout("%v\n", formatMetaNodeDetailTableHeader()) sort.SliceStable(diagnosis.InactiveMetaNodes, func(i, j int) bool { @@ -129,65 +129,146 @@ the corrupt nodes, the few remaining replicas can not reach an agreement with on for _, pid := range diagnosis.CorruptMetaPartitionIDs { var partition *proto.MetaPartitionInfo if partition, err = client.ClientAPI().GetMetaPartition(pid); err != nil { - err = fmt.Errorf("Partition not found, err:[%v] ", err) + stdout("Partition not found, err:[%v]", err) return } stdout("%v\n", formatMetaPartitionInfoRow(partition)) } stdout("\n") - stdout("%v\n", "[Meta 
partition lack replicas]:") + stdout("%v\n", "[Partition lack replicas]:") stdout("%v\n", partitionInfoTableHeader) sort.SliceStable(diagnosis.LackReplicaMetaPartitionIDs, func(i, j int) bool { return diagnosis.LackReplicaMetaPartitionIDs[i] < diagnosis.LackReplicaMetaPartitionIDs[j] }) for _, pid := range diagnosis.LackReplicaMetaPartitionIDs { var partition *proto.MetaPartitionInfo - if partition, err = client.ClientAPI().GetMetaPartition( pid); err != nil { - err = fmt.Errorf("Partition not found, err:[%v] ", err) + if partition, err = client.ClientAPI().GetMetaPartition(pid); err != nil { + stdout("Partition not found, err:[%v]", err) return } if partition != nil { stdout("%v\n", formatMetaPartitionInfoRow(partition)) - } - } - - stdout("\n") - stdout("%v\n", "[Bad meta partitions(decommission not completed)]:") - badPartitionTablePattern := "%-8v %-10v\n" - stdout(badPartitionTablePattern, "PATH", "PARTITION ID") - for _, bmpv := range diagnosis.BadMetaPartitionIDs { - sort.SliceStable(bmpv.PartitionIDs, func(i, j int) bool { - return bmpv.PartitionIDs[i] < bmpv.PartitionIDs[j] - }) - for _, pid := range bmpv.PartitionIDs { - stdout(badPartitionTablePattern, bmpv.Path, pid) + sort.Strings(partition.Hosts) + for _, r := range partition.Replicas { + var mnPartition *proto.MNMetaPartitionInfo + var err error + addr := strings.Split(r.Addr, ":")[0] + if mnPartition, err = client.NodeAPI().MetaNodeGetPartition(addr, partition.PartitionID); err != nil { + fmt.Printf(partitionInfoColorTablePattern+"\n", + "", "", "", r.Addr, fmt.Sprintf("%v/%v", 0, partition.ReplicaNum), "no data") + continue + } + mnHosts := make([]string, 0) + for _, peer := range mnPartition.Peers { + mnHosts = append(mnHosts, peer.Addr) + } + sort.Strings(mnHosts) + fmt.Printf(partitionInfoColorTablePattern+"\n", + "", "", "", r.Addr, fmt.Sprintf("%v/%v", len(mnPartition.Peers), partition.ReplicaNum), strings.Join(mnHosts, "; ")) + } + fmt.Printf("\033[1;40;32m%-8v\033[0m", strings.Repeat("_ ", len(partitionInfoTableHeader)/2+5)+"\n") } } return }, } + cmd.Flags().BoolVar(&optCheckAll, "all", false, "true - check all partitions; false - only check partitions which lack of replica") return cmd } +func checkAllMetaPartitions(client *master.MasterClient) (err error) { + var volInfo []*proto.VolInfo + if volInfo, err = client.AdminAPI().ListVols(""); err != nil { + stdout("%v\n", err) + return + } + stdout("\n") + stdout("%v\n", "[Partition peer info not valid]:") + stdout("%v\n", partitionInfoTableHeader) + for _, vol := range volInfo { + var volView *proto.VolView + if volView, err = client.ClientAPI().GetVolume(vol.Name, calcAuthKey(vol.Owner)); err != nil { + stdout("Found an invalid vol: %v\n", vol.Name) + continue + } + sort.SliceStable(volView.MetaPartitions, func(i, j int) bool { + return volView.MetaPartitions[i].PartitionID < volView.MetaPartitions[j].PartitionID + }) + var wg sync.WaitGroup + for _, mp := range volView.MetaPartitions { + wg.Add(1) + go func(mp *proto.MetaPartitionView) { + defer wg.Done() + var outPut string + var isHealthy bool + outPut, isHealthy, _ = checkMetaPartition(mp.PartitionID, client) + if !isHealthy { + fmt.Printf(outPut) + stdoutGreen(strings.Repeat("_ ", len(partitionInfoTableHeader)/2+20) + "\n") + } + time.Sleep(time.Millisecond * 10) + }(mp) + } + wg.Wait() + } + return +} +func checkMetaPartition(pid uint64, client *master.MasterClient) (outPut string, isHealthy bool, err error) { + var partition *proto.MetaPartitionInfo + var sb = strings.Builder{} + isHealthy = true + if 
partition, err = client.ClientAPI().GetMetaPartition(pid); err != nil { + sb.WriteString(fmt.Sprintf("Partition is not found, err:[%v]", err)) + return + } + if partition != nil { + sb.WriteString(fmt.Sprintf("%v\n", formatMetaPartitionInfoRow(partition))) + sort.Strings(partition.Hosts) + if len(partition.MissNodes) > 0 || partition.Status == -1 || len(partition.Hosts) != int(partition.ReplicaNum) { + errMsg := fmt.Sprintf("The partition is unhealthy according to the report message from master") + sb.WriteString(fmt.Sprintf("\033[1;40;31m%-8v\033[0m\n", errMsg)) + isHealthy = false + } + for _, r := range partition.Replicas { + var mnPartition *proto.MNMetaPartitionInfo + var err error + addr := strings.Split(r.Addr, ":")[0] + if mnPartition, err = client.NodeAPI().MetaNodeGetPartition(addr, partition.PartitionID); err != nil { + sb.WriteString(fmt.Sprintf(partitionInfoColorTablePattern+"\n", + "", "", "", fmt.Sprintf("%v", r.Addr), fmt.Sprintf("%v/%v", "nil", partition.ReplicaNum), fmt.Sprintf("get partition info failed, err:%v", err))) + isHealthy = false + continue + } + peerStrings := convertPeersToArray(mnPartition.Peers) + sort.Strings(peerStrings) + sb.WriteString(fmt.Sprintf(partitionInfoColorTablePattern+"\n", + "", "", "", fmt.Sprintf("%v(peers)", r.Addr), fmt.Sprintf("%v/%v", len(peerStrings), partition.ReplicaNum), strings.Join(peerStrings, "; "))) + if !isEqualStrings(partition.Hosts, peerStrings) { + isHealthy = false + } + if len(peerStrings) != int(partition.ReplicaNum) { + isHealthy = false + } + } + } + outPut = sb.String() + return +} func newMetaPartitionDecommissionCmd(client *master.MasterClient) *cobra.Command { var cmd = &cobra.Command{ Use: CliOpDecommission + " [ADDRESS] [META PARTITION ID]", Short: cmdMetaPartitionDecommissionShort, Args: cobra.MinimumNArgs(2), Run: func(cmd *cobra.Command, args []string) { - var ( - err error - partitionID uint64 - ) - defer func() { - if err != nil { - errout("Error: %v", err) - } - }() address := args[0] - partitionID, err = strconv.ParseUint(args[1], 10, 64) + partitionID, err := strconv.ParseUint(args[1], 10, 64) + if err != nil { + stdout("%v\n", err) + return + } if err = client.AdminAPI().DecommissionMetaPartition(partitionID, address); err != nil { + stdout("%v\n", err) return } }, @@ -207,18 +288,14 @@ func newMetaPartitionReplicateCmd(client *master.MasterClient) *cobra.Command { Short: cmdMetaPartitionReplicateShort, Args: cobra.MinimumNArgs(2), Run: func(cmd *cobra.Command, args []string) { - var ( - err error - partitionID uint64 - ) - defer func() { - if err != nil { - errout("Error: %v", err) - } - }() address := args[0] - partitionID, err = strconv.ParseUint(args[1], 10, 64) + partitionID, err := strconv.ParseUint(args[1], 10, 64) + if err != nil { + stdout("%v\n", err) + return + } if err = client.AdminAPI().AddMetaReplica(partitionID, address); err != nil { + stdout("%v\n", err) return } }, @@ -238,21 +315,14 @@ func newMetaPartitionDeleteReplicaCmd(client *master.MasterClient) *cobra.Comman Short: cmdMetaPartitionDeleteReplicaShort, Args: cobra.MinimumNArgs(2), Run: func(cmd *cobra.Command, args []string) { - var ( - err error - partitionID uint64 - ) - defer func() { - if err != nil { - errout("Error: %v", err) - } - }() address := args[0] - partitionID, err = strconv.ParseUint(args[1], 10, 64) + partitionID, err := strconv.ParseUint(args[1], 10, 64) if err != nil { + stdout("%v\n", err) return } if err = client.AdminAPI().DeleteMetaReplica(partitionID, address); err != nil { + stdout("%v\n", err) return } 
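// [Editor's note] checkAllDataPartitions and checkAllMetaPartitions above
// start one goroutine per partition and throttle only with a short sleep. A
// common alternative is a buffered-channel semaphore that caps the number of
// in-flight checks; a sketch under that assumption (the limit of 10 and the
// checkOne callback are illustrative, not part of this patch):
package main

import "sync"

func checkAllBounded(pids []uint64, checkOne func(uint64)) {
	var wg sync.WaitGroup
	sem := make(chan struct{}, 10) // at most 10 checks run concurrently
	for _, pid := range pids {
		wg.Add(1)
		sem <- struct{}{} // acquire a slot before spawning
		go func(pid uint64) {
			defer wg.Done()
			defer func() { <-sem }() // release the slot when the check ends
			checkOne(pid)
		}(pid)
	}
	wg.Wait()
}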
}, diff --git a/cli/cmd/root.go b/cli/cmd/root.go index 17332e848c..10adb35b02 100644 --- a/cli/cmd/root.go +++ b/cli/cmd/root.go @@ -16,11 +16,9 @@ package cmd import ( "fmt" - "github.com/chubaofs/chubaofs/util/log" "os" "path" - - "github.com/chubaofs/chubaofs/proto" + "strings" "github.com/chubaofs/chubaofs/sdk/master" "github.com/spf13/cobra" @@ -35,23 +33,14 @@ type ChubaoFSCmd struct { } func NewRootCmd(client *master.MasterClient) *ChubaoFSCmd { - var optShowVersion bool var cmd = &ChubaoFSCmd{ CFSCmd: &cobra.Command{ Use: path.Base(os.Args[0]), Short: cmdRootShort, Args: cobra.MinimumNArgs(0), - Run: func(cmd *cobra.Command, args []string) { - if optShowVersion { - stdout(proto.DumpVersion("CLI")) - return - } - }, }, } - cmd.CFSCmd.Flags().BoolVarP(&optShowVersion, "version", "v", false, "Show version information") - cmd.CFSCmd.AddCommand( cmd.newClusterCmd(client), newVolCmd(client), @@ -62,7 +51,6 @@ func NewRootCmd(client *master.MasterClient) *ChubaoFSCmd { newMetaPartitionCmd(client), newConfigCmd(), newCompatibilityCmd(), - newZoneCmd(client), ) return cmd } @@ -71,13 +59,15 @@ func stdout(format string, a ...interface{}) { _, _ = fmt.Fprintf(os.Stdout, format, a...) } -func errout(format string, a ...interface{}) { - log.LogErrorf(format + "\n", a...) - _, _ = fmt.Fprintf(os.Stderr, format, a...) - OsExitWithLogFlush() +func stdoutGreen(str string) { + fmt.Printf("\033[1;40;32m%-8v\033[0m\n", str) } -func OsExitWithLogFlush() { - log.LogFlush() - os.Exit(1) +func stdoutRed(str string) { + fmt.Printf("\033[1;40;31m%-8v\033[0m\n", str) + stdoutGreen(strings.Repeat("_ ", len(partitionInfoTableHeader)/2+10) + "\n") +} + +func errout(format string, a ...interface{}) { + _, _ = fmt.Fprintf(os.Stderr, format, a...) } diff --git a/cli/cmd/user.go b/cli/cmd/user.go index 10d4b4d7ac..eaa26a40e6 100644 --- a/cli/cmd/user.go +++ b/cli/cmd/user.go @@ -16,6 +16,7 @@ package cmd import ( "fmt" + "os" "strings" "github.com/chubaofs/chubaofs/proto" @@ -67,14 +68,10 @@ func newUserCreateCmd(client *master.MasterClient) *cobra.Command { var accessKey = optAccessKey var secretKey = optSecretKey var userType = proto.UserTypeFromString(optUserType) - defer func() { - if err != nil { - errout("Error: %v", err) - } - }() + if !userType.Valid() { - err = fmt.Errorf("Invalid user type. 
") - return + errout("Invalid user type.") + os.Exit(1) } // ask user for confirm @@ -101,7 +98,7 @@ func newUserCreateCmd(client *master.MasterClient) *cobra.Command { var userConfirm string _, _ = fmt.Scanln(&userConfirm) if userConfirm != "yes" && len(userConfirm) != 0 { - err = fmt.Errorf("Abort by user.\n") + stdout("Abort by user.\n") return } } @@ -115,8 +112,8 @@ func newUserCreateCmd(client *master.MasterClient) *cobra.Command { } var userInfo *proto.UserInfo if userInfo, err = client.UserAPI().CreateUser(¶m); err != nil { - err = fmt.Errorf("Create user failed: %v\n", err) - return + errout("Create user failed: %v\n", err) + os.Exit(1) } // display operation result @@ -153,16 +150,11 @@ func newUserUpdateCmd(client *master.MasterClient) *cobra.Command { var accessKey = optAccessKey var secretKey = optSecretKey var userType proto.UserType - defer func() { - if err != nil { - errout("Error: %v", err) - } - }() if optUserType != "" { userType = proto.UserTypeFromString(optUserType) if !userType.Valid() { - err = fmt.Errorf("Invalid user type ") - return + errout("Invalid user type.\n") + os.Exit(1) } } @@ -188,12 +180,14 @@ func newUserUpdateCmd(client *master.MasterClient) *cobra.Command { var userConfirm string _, _ = fmt.Scanln(&userConfirm) if userConfirm != "yes" && len(userConfirm) != 0 { - err = fmt.Errorf("Abort by user.\n") + stdout("Abort by user.\n") + os.Exit(1) return } } if accessKey == "" && secretKey == "" && optUserType == "" { - err = fmt.Errorf("no update") + stdout("No update.\n") + os.Exit(1) return } var param = proto.UserUpdateParam{ @@ -204,7 +198,8 @@ func newUserUpdateCmd(client *master.MasterClient) *cobra.Command { } var userInfo *proto.UserInfo if userInfo, err = client.UserAPI().UpdateUser(¶m); err != nil { - return + errout("Update user failed: %v\n", err) + os.Exit(1) } stdout("Update user success:\n") @@ -234,24 +229,21 @@ func newUserDeleteCmd(client *master.MasterClient) *cobra.Command { Run: func(cmd *cobra.Command, args []string) { var err error var userID = args[0] - defer func() { - if err != nil { - errout("Error: %v", err) - } - }() + if !optYes { stdout("Delete user [%v] (yes/no)[no]:", userID) var userConfirm string _, _ = fmt.Scanln(&userConfirm) if userConfirm != "yes" { - err = fmt.Errorf("Abort by user.\n") + stdout("Abort by user.\n") + os.Exit(1) return } } if err = client.UserAPI().DeleteUser(userID); err != nil { - err = fmt.Errorf("Delete user failed:\n%v\n", err) - return + errout("Delete user failed:\n%v\n", err) + os.Exit(1) } stdout("Delete user success.\n") return @@ -282,14 +274,9 @@ func newUserInfoCmd(client *master.MasterClient) *cobra.Command { var err error var userID = args[0] var userInfo *proto.UserInfo - defer func() { - if err != nil { - errout("Error: %v", err) - } - }() if userInfo, err = client.UserAPI().GetUserInfo(userID); err != nil { - err = fmt.Errorf("Get user info failed: %v\n", err) - return + errout("Get user info failed: %v\n", err) + os.Exit(1) } printUserInfo(userInfo) }, @@ -310,42 +297,28 @@ const ( ) func newUserPermCmd(client *master.MasterClient) *cobra.Command { - var subdir string var cmd = &cobra.Command{ Use: cmdUserPermUse, Short: cmdUserPermShort, Args: cobra.MinimumNArgs(3), Run: func(cmd *cobra.Command, args []string) { - var err error var userID = args[0] var volume = args[1] var perm proto.Permission - defer func() { - if err != nil { - errout("Error: %v", err) - } - }() - - perm = proto.BuiltinPermissionPrefix - if subdir != "" && subdir != "/" { - perm = proto.Permission(string(perm) 
+ subdir + ":") - } - switch strings.ToLower(args[2]) { case "ro", "readonly": - perm = perm + "ReadOnly" + perm = proto.BuiltinPermissionReadOnly case "rw", "readwrite": - perm = perm + "Writable" + perm = proto.BuiltinPermissionWritable case "none": perm = proto.NonePermission default: - err = fmt.Errorf("Permission must be on of ro, rw, none ") + stdout("Permission must be on of ro, rw, none") return } stdout("Setup volume permission\n") stdout(" User ID : %v\n", userID) stdout(" Volume : %v\n", volume) - stdout(" Subdir : %v\n", subdir) stdout(" Permission: %v\n", perm.ReadableString()) // ask user for confirm @@ -353,9 +326,16 @@ func newUserPermCmd(client *master.MasterClient) *cobra.Command { var userConfirm string _, _ = fmt.Scanln(&userConfirm) if userConfirm != "yes" && len(userConfirm) != 0 { - err = fmt.Errorf("Abort by user.\n") + stdout("Abort by user.\n") return } + var err error + defer func() { + if err != nil { + errout("Setup permission failed:\n%v\n", err) + os.Exit(1) + } + }() var userInfo *proto.UserInfo if userInfo, err = client.UserAPI().GetUserInfo(userID); err != nil { return @@ -383,7 +363,6 @@ func newUserPermCmd(client *master.MasterClient) *cobra.Command { return validUsers(client, toComplete), cobra.ShellCompDirectiveNoFileComp }, } - cmd.Flags().StringVar(&subdir, "subdir", "", "Subdir") return cmd } @@ -402,7 +381,8 @@ func newUserListCmd(client *master.MasterClient) *cobra.Command { var err error defer func() { if err != nil { - errout("Error: %v", err) + errout("List cluster user failed: %v\n", err) + os.Exit(1) } }() if users, err = client.UserAPI().ListUsers(optKeyword); err != nil { @@ -437,3 +417,18 @@ func printUserInfo(userInfo *proto.UserInfo) { stdout("%-20v %-12v\n", vol, strings.Join(perms, ",")) } } + +func validUsers(client *master.MasterClient, toComplete string) []string { + var ( + validUsers []string + users []*proto.UserInfo + err error + ) + if users, err = client.UserAPI().ListUsers(toComplete); err != nil { + errout("Get user list failed:\n%v\n", err) + } + for _, user := range users { + validUsers = append(validUsers, user.UserID) + } + return validUsers +} diff --git a/cli/cmd/valid.go b/cli/cmd/valid.go index 1878000605..b8f5ed46ea 100644 --- a/cli/cmd/valid.go +++ b/cli/cmd/valid.go @@ -16,19 +16,17 @@ package cmd import ( "github.com/chubaofs/chubaofs/proto" - sdk "github.com/chubaofs/chubaofs/sdk/master" + "github.com/chubaofs/chubaofs/sdk/master" ) -func validVols(client, complete interface{}) []string { +func validVols(client *master.MasterClient, toComplete string) []string { var ( validVols []string vols []*proto.VolInfo err error ) - clientSdk := client.(*sdk.MasterClient) - completeStr := complete.(string) - if vols, err = clientSdk.AdminAPI().ListVols(completeStr); err != nil { - errout("Error: %v", err) + if vols, err = client.AdminAPI().ListVols(toComplete); err != nil { + errout("Get volume list failed:\n%v\n", err) } for _, vol := range vols { validVols = append(validVols, vol.Name) @@ -36,7 +34,7 @@ func validVols(client, complete interface{}) []string { return validVols } -func validDataNodes(client *sdk.MasterClient, toComplete string) []string { +func validDataNodes(client *master.MasterClient, toComplete string) []string { var ( validDataNodes []string clusterView *proto.ClusterView @@ -44,7 +42,7 @@ func validDataNodes(client *sdk.MasterClient, toComplete string) []string { err error ) if clusterView, err = client.AdminAPI().GetCluster(); err != nil { - errout("Error: %v", err) + errout("Get data node list 
failed:\n%v\n", err) } for _, dn := range clusterView.DataNodes { validDataNodes = append(validDataNodes, dn.Addr) @@ -52,47 +50,18 @@ func validDataNodes(client *sdk.MasterClient, toComplete string) []string { return validDataNodes } -func validMetaNodes(client *sdk.MasterClient, toComplete string) []string { +func validMetaNodes(client *master.MasterClient, toComplete string) []string { var ( validMetaNodes []string clusterView *proto.ClusterView + err error ) if clusterView, err = client.AdminAPI().GetCluster(); err != nil { - errout("Error: %v", err) + errout("Get meta node list failed:\n%v\n", err) } for _, mn := range clusterView.MetaNodes { validMetaNodes = append(validMetaNodes, mn.Addr) } return validMetaNodes } - -func validUsers(client *sdk.MasterClient, toComplete string) []string { - var ( - validUsers []string - users []*proto.UserInfo - err error - ) - if users, err = client.UserAPI().ListUsers(toComplete); err != nil { - errout("Error: %v", err) - } - for _, user := range users { - validUsers = append(validUsers, user.UserID) - } - return validUsers -} - -func validZones(client *sdk.MasterClient, toComplete string) []string { - var ( - validZones []string - zones []*proto.ZoneView - err error - ) - if zones, err = client.AdminAPI().ListZones(); err != nil { - errout("Error: %v", err) - } - for _, zone := range zones { - validZones = append(validZones, zone.Name) - } - return validZones -} diff --git a/cli/cmd/vol.go b/cli/cmd/vol.go index 446c7cb3f8..ec2487cbc9 100644 --- a/cli/cmd/vol.go +++ b/cli/cmd/vol.go @@ -18,10 +18,13 @@ import ( "crypto/md5" "encoding/hex" "fmt" + "os" "sort" "strconv" "strings" + "github.com/chubaofs/chubaofs/util/errors" + "github.com/chubaofs/chubaofs/proto" "github.com/chubaofs/chubaofs/sdk/master" "github.com/spf13/cobra" @@ -42,13 +45,11 @@ func newVolCmd(client *master.MasterClient) *cobra.Command { cmd.AddCommand( newVolListCmd(client), newVolCreateCmd(client), - newVolExpandCmd(client), - newVolShrinkCmd(client), - newVolSetCmd(client), newVolInfoCmd(client), newVolDeleteCmd(client), newVolTransferCmd(client), newVolAddDPCmd(client), + newVolSetCmd(client), ) return cmd } @@ -59,6 +60,7 @@ const ( func newVolListCmd(client *master.MasterClient) *cobra.Command { var optKeyword string + var optDetailMod bool var cmd = &cobra.Command{ Use: CliOpList, Short: cmdVolListShort, @@ -68,19 +70,34 @@ func newVolListCmd(client *master.MasterClient) *cobra.Command { var err error defer func() { if err != nil { - errout("Error: %v", err) + errout("List cluster volume failed:\n%v\n", err) + os.Exit(1) } }() if vols, err = client.AdminAPI().ListVols(optKeyword); err != nil { return } - stdout("%v\n", volumeInfoTableHeader) + if optDetailMod { + stdout("%v\n", volumeDetailInfoTableHeader) + } else { + stdout("%v\n", volumeInfoTableHeader) + } for _, vol := range vols { - stdout("%v\n", formatVolInfoTableRow(vol)) + var vv *proto.SimpleVolView + if vv, err = client.AdminAPI().GetVolumeSimpleInfo(vol.Name); err != nil { + return + } + if optDetailMod { + stdout("%v\n", formatVolDetailInfoTableRow(vv, vol)) + } else { + stdout("%v\n", formatVolInfoTableRow(vol)) + } } }, } + cmd.Flags().BoolVarP(&optDetailMod, "detail-mod", "d", false, "list the volumes with empty zone name") cmd.Flags().StringVar(&optKeyword, "keyword", "", "Specify keyword of volume name to filter") + return cmd } @@ -92,7 +109,7 @@ const ( cmdVolDefaultCapacity = 10 // 100GB cmdVolDefaultReplicas = 3 cmdVolDefaultFollowerReader = true - cmdVolDefaultZoneName = "default" + 
cmdVolDefaultZoneName = "default" ) func newVolCreateCmd(client *master.MasterClient) *cobra.Command { @@ -101,6 +118,7 @@ func newVolCreateCmd(client *master.MasterClient) *cobra.Command { var optCapacity uint64 var optReplicas int var optFollowerRead bool + var optAutoRepair bool var optYes bool var optZoneName string var cmd = &cobra.Command{ @@ -111,11 +129,7 @@ func newVolCreateCmd(client *master.MasterClient) *cobra.Command { var err error var volumeName = args[0] var userID = args[1] - defer func() { - if err != nil { - errout("Error: %v", err) - } - }() + // ask user for confirm if !optYes { stdout("Create a new volume:\n") @@ -126,22 +140,22 @@ func newVolCreateCmd(client *master.MasterClient) *cobra.Command { stdout(" Capacity : %v GB\n", optCapacity) stdout(" Replicas : %v\n", optReplicas) stdout(" Allow follower read : %v\n", formatEnabledDisabled(optFollowerRead)) + stdout(" Auto repair : %v\n", formatEnabledDisabled(optAutoRepair)) + stdout(" ZoneName : %v\n", optZoneName) stdout("\nConfirm (yes/no)[yes]: ") var userConfirm string _, _ = fmt.Scanln(&userConfirm) if userConfirm != "yes" && len(userConfirm) != 0 { - err = fmt.Errorf("Abort by user.\n") + stdout("Abort by user.\n") return } } - err = client.AdminAPI().CreateVolume( - volumeName, userID, optMPCount, optDPSize, - optCapacity, optReplicas, optFollowerRead, optZoneName) + err = client.AdminAPI().CreateVolume(volumeName, userID, optMPCount, optDPSize, optCapacity, optReplicas, optFollowerRead, optAutoRepair, optZoneName) if err != nil { - err = fmt.Errorf("Create volume failed case:\n%v\n", err) - return + errout("Create volume failed case:\n%v\n", err) + os.Exit(1) } stdout("Create volume success.\n") return @@ -152,23 +166,31 @@ func newVolCreateCmd(client *master.MasterClient) *cobra.Command { cmd.Flags().Uint64Var(&optCapacity, CliFlagCapacity, cmdVolDefaultCapacity, "Specify volume capacity [Unit: GB]") cmd.Flags().IntVar(&optReplicas, CliFlagReplicas, cmdVolDefaultReplicas, "Specify data partition replicas number") cmd.Flags().BoolVar(&optFollowerRead, CliFlagEnableFollowerRead, cmdVolDefaultFollowerReader, "Enable read form replica follower") + cmd.Flags().BoolVar(&optAutoRepair, CliFlagAutoRepair, false, "Enable auto balance partition distribution according to zoneName") cmd.Flags().StringVar(&optZoneName, CliFlagZoneName, cmdVolDefaultZoneName, "Specify volume zone name") cmd.Flags().BoolVarP(&optYes, "yes", "y", false, "Answer yes for all questions") return cmd } + const ( - cmdVolSetShort = "Set configuration of the volume" + cmdVolInfoUse = "info [VOLUME NAME]" + cmdVolInfoShort = "Show volume information" + cmdVolSetShort = "Set configuration of the volume" ) + func newVolSetCmd(client *master.MasterClient) *cobra.Command { - var optCapacity uint64 - var optReplicas int - var optFollowerRead string - var optAuthenticate string - var optEnableToken string - var optZoneName string - var optYes bool - var confirmString = strings.Builder{} - var vv *proto.SimpleVolView + var ( + optCapacity uint64 + optReplicas int + optFollowerRead string + optAuthenticate string + optEnableToken string + optAutoRepair string + optZoneName string + optYes bool + confirmString = strings.Builder{} + vv *proto.SimpleVolView + ) var cmd = &cobra.Command{ Use: CliOpSet + " [VOLUME NAME]", Short: cmdVolSetShort, @@ -190,7 +212,7 @@ func newVolSetCmd(client *master.MasterClient) *cobra.Command { if optCapacity > 0 { isChange = true confirmString.WriteString(fmt.Sprintf(" Capacity : %v GB -> %v GB\n", vv.Capacity, optCapacity)) - 
vv.Capacity = optCapacity + vv.Capacity = optCapacity } else { confirmString.WriteString(fmt.Sprintf(" Capacity : %v GB\n", vv.Capacity)) } @@ -235,16 +257,24 @@ func newVolSetCmd(client *master.MasterClient) *cobra.Command { } else { confirmString.WriteString(fmt.Sprintf(" EnableToken : %v\n", formatEnabledDisabled(vv.EnableToken))) } - if vv.CrossZone == false && "" != optZoneName { + if optAutoRepair != "" { + isChange = true + var enable bool + if enable, err = strconv.ParseBool(optAutoRepair); err != nil { + return + } + confirmString.WriteString(fmt.Sprintf(" AutoRepair : %v -> %v\n", formatEnabledDisabled(vv.AutoRepair), formatEnabledDisabled(enable))) + vv.AutoRepair = enable + } else { + confirmString.WriteString(fmt.Sprintf(" AutoRepair : %v\n", formatEnabledDisabled(vv.AutoRepair))) + } + if "" != optZoneName { isChange = true confirmString.WriteString(fmt.Sprintf(" ZoneName : %v -> %v\n", vv.ZoneName, optZoneName)) vv.ZoneName = optZoneName } else { confirmString.WriteString(fmt.Sprintf(" ZoneName : %v\n", vv.ZoneName)) } - if vv.CrossZone == true && "" != optZoneName { - err = fmt.Errorf("Can not set zone name of the volume that cross zone\n") - } if err != nil { return } @@ -264,7 +294,7 @@ func newVolSetCmd(client *master.MasterClient) *cobra.Command { } } err = client.AdminAPI().UpdateVolume(vv.Name, vv.Capacity, int(vv.DpReplicaNum), - vv.FollowerRead, vv.Authenticate, vv.EnableToken, calcAuthKey(vv.Owner), vv.ZoneName) + vv.FollowerRead, vv.Authenticate, vv.EnableToken, vv.AutoRepair, calcAuthKey(vv.Owner), vv.ZoneName) if err != nil { return } @@ -285,14 +315,10 @@ func newVolSetCmd(client *master.MasterClient) *cobra.Command { cmd.Flags().StringVar(&optEnableToken, CliFlagEnableToken, "", "ReadOnly/ReadWrite token validation for fuse client") cmd.Flags().StringVar(&optZoneName, CliFlagZoneName, "", "Specify volume zone name") cmd.Flags().BoolVarP(&optYes, "yes", "y", false, "Answer yes for all questions") + cmd.Flags().StringVar(&optAutoRepair, CliFlagAutoRepair, "", "Enable auto balance partition distribution according to zoneName") + return cmd } - -const ( - cmdVolInfoUse = "info [VOLUME NAME]" - cmdVolInfoShort = "Show volume information" -) - func newVolInfoCmd(client *master.MasterClient) *cobra.Command { var ( optMetaDetail bool @@ -307,14 +333,10 @@ func newVolInfoCmd(client *master.MasterClient) *cobra.Command { var err error var volumeName = args[0] var svv *proto.SimpleVolView - defer func() { - if err != nil { - errout("Error: %v", err) - } - }() + if svv, err = client.AdminAPI().GetVolumeSimpleInfo(volumeName); err != nil { - err = fmt.Errorf("Get volume info failed:\n%v\n", err) - return + errout("Get volume info failed:\n%v\n", err) + os.Exit(1) } // print summary info stdout("Summary:\n%s\n", formatSimpleVolView(svv)) @@ -323,8 +345,8 @@ func newVolInfoCmd(client *master.MasterClient) *cobra.Command { if optMetaDetail { var views []*proto.MetaPartitionView if views, err = client.ClientAPI().GetMetaPartitions(volumeName); err != nil { - err = fmt.Errorf("Get volume metadata detail information failed:\n%v\n", err) - return + errout("Get volume metadata detail information failed:\n%v\n", err) + os.Exit(1) } stdout("Meta partitions:\n") stdout("%v\n", metaPartitionTableHeader) @@ -340,8 +362,8 @@ func newVolInfoCmd(client *master.MasterClient) *cobra.Command { if optDataDetail { var view *proto.DataPartitionsView if view, err = client.ClientAPI().GetDataPartitions(volumeName); err != nil { - err = fmt.Errorf("Get volume data detail information 
failed:\n%v\n", err) - return + errout("Get volume data detail information failed:\n%v\n", err) + os.Exit(1) } stdout("Data partitions:\n") stdout("%v\n", dataPartitionTableHeader) @@ -382,31 +404,26 @@ func newVolDeleteCmd(client *master.MasterClient) *cobra.Command { Run: func(cmd *cobra.Command, args []string) { var err error var volumeName = args[0] - defer func() { - if err != nil { - errout("Error: %v", err) - } - }() // ask user for confirm if !optYes { stdout("Delete volume [%v] (yes/no)[no]:", volumeName) var userConfirm string _, _ = fmt.Scanln(&userConfirm) if userConfirm != "yes" { - err = fmt.Errorf("Abort by user.\n") + stdout("Abort by user.\n") return } } var svv *proto.SimpleVolView if svv, err = client.AdminAPI().GetVolumeSimpleInfo(volumeName); err != nil { - err = fmt.Errorf("Delete volume failed:\n%v\n", err) - return + errout("Delete volume failed:\n%v\n", err) + os.Exit(1) } if err = client.AdminAPI().DeleteVolume(volumeName, calcAuthKey(svv.Owner)); err != nil { - err = fmt.Errorf("Delete volume failed:\n%v\n", err) - return + errout("Delete volume failed:\n%v\n", err) + os.Exit(1) } stdout("Delete volume success.\n") }, @@ -441,7 +458,8 @@ func newVolTransferCmd(client *master.MasterClient) *cobra.Command { defer func() { if err != nil { - errout("Error: %v", err) + errout("Transfer volume [%v] to user [%v] failed: %v\n", volume, userID, err) + os.Exit(1) } }() @@ -451,7 +469,7 @@ func newVolTransferCmd(client *master.MasterClient) *cobra.Command { var confirm string _, _ = fmt.Scanln(&confirm) if confirm != "yes" { - err = fmt.Errorf("Abort by user.\n") + stdout("Abort by user.\n") return } } @@ -501,7 +519,8 @@ func newVolAddDPCmd(client *master.MasterClient) *cobra.Command { var err error defer func() { if err != nil { - errout("Error: %v", err) + errout("Create data partition failed: %v\n", err) + os.Exit(1) } }() var count int64 @@ -509,7 +528,7 @@ func newVolAddDPCmd(client *master.MasterClient) *cobra.Command { return } if count < 1 { - err = fmt.Errorf("number must be larger than 0") + err = errors.New("number must be larger than 0") return } if err = client.AdminAPI().CreateDataPartition(volume, int(count)); err != nil { @@ -527,56 +546,6 @@ func newVolAddDPCmd(client *master.MasterClient) *cobra.Command { return cmd } -const ( - cmdExpandVolCmdShort = "Expand capacity of a volume" - cmdShrinkVolCmdShort = "Shrink capacity of a volume" -) - -func newVolExpandCmd(client *master.MasterClient) *cobra.Command { - volClient := NewVolumeClient(OpExpandVol, client) - return newVolSetCapacityCmd(CliOpExpand, cmdExpandVolCmdShort, volClient) -} - -func newVolShrinkCmd(client *master.MasterClient) *cobra.Command { - volClient := NewVolumeClient(OpShrinkVol, client) - return newVolSetCapacityCmd(CliOpShrink, cmdShrinkVolCmdShort, volClient) -} - -func newVolSetCapacityCmd(use, short string, r clientHandler) *cobra.Command { - var cmd = &cobra.Command{ - Use: use + " [VOLUME] [CAPACITY]", - Short: short, - Args: cobra.MinimumNArgs(2), - Run: func(cmd *cobra.Command, args []string) { - var name = args[0] - var capacityStr = args[1] - var err error - defer func() { - if err != nil { - errout("Error: %v", err) - } - }() - volume := r.(*volumeClient) - if volume.capacity, err = strconv.ParseUint(capacityStr, 10, 64); err != nil { - return - } - volume.name = name - if err = volume.excuteHttp(); err != nil { - return - } - return - }, - ValidArgsFunction: func(cmd *cobra.Command, args []string, toComplete string) ([]string, cobra.ShellCompDirective) { - if len(args) != 0 { 
- return nil, cobra.ShellCompDirectiveNoFileComp - } - volume := r.(*volumeClient) - return validVols(volume.client, toComplete), cobra.ShellCompDirectiveNoFileComp - }, - } - return cmd -} - func calcAuthKey(key string) (authKey string) { h := md5.New() _, _ = h.Write([]byte(key)) diff --git a/cli/cmd/zone.go b/cli/cmd/zone.go deleted file mode 100644 index d0258d6596..0000000000 --- a/cli/cmd/zone.go +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright 2018 The Chubao Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the License. - -package cmd - -import ( - "fmt" - "github.com/chubaofs/chubaofs/proto" - sdk "github.com/chubaofs/chubaofs/sdk/master" - "github.com/spf13/cobra" -) - -const ( - cmdZoneUse = "zone [COMMAND]" - cmdZoneShort = "Manage zone" -) - -func newZoneCmd(client *sdk.MasterClient) *cobra.Command { - var cmd = &cobra.Command{ - Use: cmdZoneUse, - Short: cmdZoneShort, - Args: cobra.MinimumNArgs(0), - } - cmd.AddCommand( - newZoneListCmd(client), - newZoneInfoCmd(client), - ) - return cmd -} - -const ( - cmdZoneListShort = "List cluster zones" - cmdZoneInfoShort = "Show zone information" -) - -func newZoneListCmd(client *sdk.MasterClient) *cobra.Command { - var cmd = &cobra.Command{ - Use: CliOpList, - Short: cmdZoneListShort, - Aliases: []string{"ls"}, - Run: func(cmd *cobra.Command, args []string) { - var zones []*proto.ZoneView - var err error - defer func() { - if err != nil { - errout("Error: %v", err) - } - }() - if zones, err = client.AdminAPI().ListZones(); err != nil { - return - } - zoneTablePattern := "%-8v %-10v\n" - stdout(zoneTablePattern, "ZONE", "STATUS") - for _, zone := range zones { - stdout(zoneTablePattern, zone.Name, zone.Status) - } - return - }, - } - return cmd -} - - -func newZoneInfoCmd(client *sdk.MasterClient) *cobra.Command { - var cmd = &cobra.Command{ - Use: CliOpInfo + " [NAME]", - Short: cmdZoneInfoShort, - Args: cobra.MinimumNArgs(1), - Run: func(cmd *cobra.Command, args []string) { - var topo *proto.TopologyView - var ( - err error - zoneName string - zoneView *proto.ZoneView - ) - defer func() { - if err != nil { - errout("Error: %v", err) - } - }() - zoneName = args[0] - if topo, err = client.AdminAPI().Topo(); err != nil { - return - } - - for _, zone := range topo.Zones { - if zoneName == zone.Name { - zoneView = zone - } - } - if zoneView == nil { - err = fmt.Errorf("Zone[%v] not exists in cluster\n ", zoneName) - return - } - stdout(formatZoneView(zoneView)) - return - }, - ValidArgsFunction: func(cmd *cobra.Command, args []string, toComplete string) ([]string, cobra.ShellCompDirective) { - if len(args) != 0 { - return nil, cobra.ShellCompDirectiveNoFileComp - } - return validZones(client, toComplete), cobra.ShellCompDirectiveNoFileComp - }, - } - return cmd -} diff --git a/datanode/data_partition_repair.go b/datanode/data_partition_repair.go index 39309a85d7..59aa633ac0 100644 --- a/datanode/data_partition_repair.go +++ b/datanode/data_partition_repair.go @@ -413,7 +413,7 @@ func (dp *DataPartition) notifyFollower(wg *sync.WaitGroup, 
index int, members [ if err = p.WriteToConn(conn); err != nil { return err } - if err = p.ReadFromConn(conn, proto.NoReadDeadlineTime); err != nil { + if err = p.ReadFromConn(conn, proto.MaxWaitFollowerRepairTime); err != nil { return err } return err diff --git a/datanode/disk.go b/datanode/disk.go index e092499bc4..aea77d1f84 100644 --- a/datanode/disk.go +++ b/datanode/disk.go @@ -26,10 +26,11 @@ import ( "syscall" "time" + "os" + "github.com/chubaofs/chubaofs/proto" "github.com/chubaofs/chubaofs/util/exporter" "github.com/chubaofs/chubaofs/util/log" - "os" ) var ( diff --git a/datanode/partition.go b/datanode/partition.go index 53f335b4aa..9d0c9d923a 100644 --- a/datanode/partition.go +++ b/datanode/partition.go @@ -794,6 +794,10 @@ func (dp *DataPartition) doStreamFixTinyDeleteRecord(repairTask *DataPartitionRe // ChangeRaftMember is a wrapper function of changing the raft member. func (dp *DataPartition) ChangeRaftMember(changeType raftProto.ConfChangeType, peer raftProto.Peer, context []byte) (resp interface{}, err error) { + log.LogErrorf("[DataPartition->ChangeRaftMember] [partitionID: %v] start [changeType: %v, peer: %v]", dp.partitionID, changeType, peer) + defer func() { + log.LogErrorf("[DataPartition->ChangeRaftMember] [partitionID: %v] finish [changeType: %v, peer: %v]", dp.partitionID, changeType, peer) + }() resp, err = dp.raftPartition.ChangeMember(changeType, peer, context) return } diff --git a/datanode/partition_raft.go b/datanode/partition_raft.go index 9198587347..1e7013d138 100644 --- a/datanode/partition_raft.go +++ b/datanode/partition_raft.go @@ -361,8 +361,12 @@ func (dp *DataPartition) removeRaftNode(req *proto.RemoveDataPartitionRaftMember } dp.config.Peers = append(dp.config.Peers[:peerIndex], dp.config.Peers[peerIndex+1:]...) 
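// The branch just below distinguishes two clean-up modes for a replica that
// removes itself: when the admin task sets ReserveResource, the partition is
// only dropped from the in-memory cache and its data stays on disk;
// otherwise the raft instance is expired and the data directory is renamed
// rather than removed (see SpaceManager.ExpiredPartition later in this
// diff), so the data remains recoverable until the expired directories are
// cleaned up. A sketch of the rename it performs, assuming
// ExpiredPartitionPrefix is "expired_" and the path, strconv, and time
// imports:

func expiredPath(current string) string {
	current = path.Clean(current)
	suffix := "_" + strconv.FormatInt(time.Now().Unix(), 10)
	return path.Join(path.Dir(current), "expired_"+path.Base(current)+suffix)
}

// expiredPath("/disk0/datapartition_1_128849018880")
//   -> "/disk0/expired_datapartition_1_128849018880_1600054521" (timestamp varies)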
if dp.config.NodeID == req.RemovePeer.ID && !dp.isLoadingDataPartition && canRemoveSelf { - dp.raftPartition.Delete() - dp.Disk().space.DeletePartition(dp.partitionID) + if req.ReserveResource { + dp.Disk().space.DeletePartitionFromCache(dp.partitionID) + } else { + dp.raftPartition.Expired() + dp.Disk().space.ExpiredPartition(dp.partitionID) + } isUpdated = false } log.LogInfof("Fininsh RemoveRaftNode PartitionID(%v) nodeID(%v) do RaftLog (%v) ", @@ -475,6 +479,7 @@ func (s *DataNode) startRaftServer(cfg *config.Config) (err error) { raftConf := &raftstore.Config{ NodeID: s.nodeID, RaftPath: s.raftDir, + TickInterval: s.tickInterval, IPAddr: LocalIP, HeartbeatPort: heartbeatPort, ReplicaPort: replicatePort, diff --git a/datanode/partition_raftfsm.go b/datanode/partition_raftfsm.go index c5483b40d6..cb6719846f 100644 --- a/datanode/partition_raftfsm.go +++ b/datanode/partition_raftfsm.go @@ -44,6 +44,8 @@ func (dp *DataPartition) ApplyMemberChange(confChange *raftproto.ConfChange, ind dp.uploadApplyID(index) }(index) + defer log.LogErrorf("[DataPartition->ApplyMemberChange] [partitionID: %v] finish apply [index: %v, changeType: %v, peer: %v]", + dp.partitionID, index, confChange.Type, confChange.Peer) // Change memory the status var ( isUpdated bool diff --git a/datanode/server.go b/datanode/server.go index 2a9e37f00f..fa43a95cd4 100644 --- a/datanode/server.go +++ b/datanode/server.go @@ -65,14 +65,15 @@ const ( ) const ( - ConfigKeyLocalIP = "localIP" // string - ConfigKeyPort = "port" // int - ConfigKeyMasterAddr = "masterAddr" // array - ConfigKeyZone = "zoneName" // string - ConfigKeyDisks = "disks" // array - ConfigKeyRaftDir = "raftDir" // string - ConfigKeyRaftHeartbeat = "raftHeartbeat" // string - ConfigKeyRaftReplica = "raftReplica" // string + ConfigKeyLocalIP = "localIP" // string + ConfigKeyPort = "port" // int + ConfigKeyMasterAddr = "masterAddr" // array + ConfigKeyZone = "zoneName" // string + ConfigKeyDisks = "disks" // array + ConfigKeyRaftDir = "raftDir" // string + ConfigKeyRaftHeartbeat = "raftHeartbeat" // string + ConfigKeyRaftReplica = "raftReplica" // string + cfgTickIntervalMs = "tickIntervalMs" // int ) // DataNode defines the structure of a data node. 
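// How the new tickIntervalMs setting flows end to end: the datanode JSON
// config (docker/conf/datanode*.json below) supplies "tickIntervalMs";
// parseConfig in the next hunk reads and sanity-checks it; startRaftServer
// above hands it to raftstore.Config as TickInterval. Condensed view of the
// added lines, side by side (not new API, just this PR's pieces together):

s.tickInterval = int(cfg.GetFloat(cfgTickIntervalMs)) // 0 when the key is absent
if s.tickInterval <= 300 {
	s.tickInterval = 500 // floor: avoid an overly aggressive raft tick
}
raftConf := &raftstore.Config{
	NodeID:       s.nodeID,
	RaftPath:     s.raftDir,
	TickInterval: s.tickInterval,
}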
@@ -88,6 +89,7 @@ type DataNode struct {
 	raftHeartbeat string
 	raftReplica   string
 	raftStore     raftstore.RaftStore
+	tickInterval  int
 	tcpListener net.Listener
 	stopC       chan bool

@@ -197,6 +199,12 @@ func (s *DataNode) parseConfig(cfg *config.Config) (err error) {
 		s.zoneName = DefaultZoneName
 	}

+	s.tickInterval = int(cfg.GetFloat(cfgTickIntervalMs))
+	if s.tickInterval <= 300 {
+		log.LogWarnf("config [%s]:[%v] is less than 300, using default 500", cfgTickIntervalMs, cfg.GetString(cfgTickIntervalMs))
+		s.tickInterval = 500
+	}
+
 	log.LogDebugf("action[parseConfig] load masterAddrs(%v).", MasterClient.Nodes())
 	log.LogDebugf("action[parseConfig] load port(%v).", s.port)
 	log.LogDebugf("action[parseConfig] load zoneName(%v).", s.zoneName)
diff --git a/datanode/server_handler.go b/datanode/server_handler.go
index 28e082d058..d76a83a1a2 100644
--- a/datanode/server_handler.go
+++ b/datanode/server_handler.go
@@ -181,6 +181,7 @@ func (s *DataNode) getPartitionAPI(w http.ResponseWriter, r *http.Request) {
 		Files                []*storage.ExtentInfo `json:"extents"`
 		FileCount            int                   `json:"fileCount"`
 		Replicas             []string              `json:"replicas"`
+		Peers                []proto.Peer          `json:"peers"`
 		TinyDeleteRecordSize int64                 `json:"tinyDeleteRecordSize"`
 		RaftStatus           *raft.Status          `json:"raftStatus"`
 	}{
@@ -195,6 +196,7 @@
 		Replicas:             partition.Replicas(),
 		TinyDeleteRecordSize: tinyDeleteRecordSize,
 		RaftStatus:           partition.raftPartition.Status(),
+		Peers:                partition.config.Peers,
 	}
 	s.buildSuccessResp(w, result)
 }
diff --git a/datanode/space_manager.go b/datanode/space_manager.go
index c5c0be5279..d3de1a3d4f 100644
--- a/datanode/space_manager.go
+++ b/datanode/space_manager.go
@@ -16,15 +16,18 @@ package datanode

 import (
 	"fmt"
+	"path"
+	"strconv"
 	"sync"
 	"time"

+	"math"
+	"os"
+
 	"github.com/chubaofs/chubaofs/proto"
 	"github.com/chubaofs/chubaofs/raftstore"
 	"github.com/chubaofs/chubaofs/util"
 	"github.com/chubaofs/chubaofs/util/log"
-	"math"
-	"os"
 )

 // SpaceManager manages the disk space.
@@ -300,6 +303,51 @@ func (manager *SpaceManager) DeletePartition(dpID uint64) {
 	os.RemoveAll(dp.Path())
 }

+// ExpiredPartition marks the specified partition as expired.
+// It renames the data path, adding the 'expired_' prefix and the operation timestamp as a suffix.
+// (e.g. '/disk0/datapartition_1_128849018880' to '/disk0/expired_datapartition_1_128849018880_1600054521')
+func (manager *SpaceManager) ExpiredPartition(partitionID uint64) {
+	dp := manager.Partition(partitionID)
+	if dp == nil {
+		return
+	}
+	manager.partitionMutex.Lock()
+	delete(manager.partitions, partitionID)
+	manager.partitionMutex.Unlock()
+	dp.Stop()
+	dp.Disk().DetachDataPartition(dp)
+	var currentPath = path.Clean(dp.Path())
+	var newPath = path.Join(path.Dir(currentPath),
+		ExpiredPartitionPrefix+path.Base(currentPath)+"_"+strconv.FormatInt(time.Now().Unix(), 10))
+	if err := os.Rename(currentPath, newPath); err != nil {
+		log.LogErrorf("ExpiredPartition: mark expired partition fail: volume(%v) partitionID(%v) path(%v) newPath(%v) err(%v)",
+			dp.volumeID,
+			dp.partitionID,
+			dp.path,
+			newPath,
+			err)
+		return
+	}
+	log.LogInfof("ExpiredPartition: mark expired partition: volume(%v) partitionID(%v) path(%v) newPath(%v)",
+		dp.volumeID,
+		dp.partitionID,
+		dp.path,
+		newPath)
+}
+
+// DeletePartitionFromCache deletes a partition from the cache based on the partition id; unlike DeletePartition, it leaves the on-disk data intact.
+func (manager *SpaceManager) DeletePartitionFromCache(dpID uint64) { + dp := manager.Partition(dpID) + if dp == nil { + return + } + manager.partitionMutex.Lock() + delete(manager.partitions, dpID) + manager.partitionMutex.Unlock() + dp.Stop() + dp.Disk().DetachDataPartition(dp) +} + func (s *DataNode) buildHeartBeatResponse(response *proto.DataNodeHeartbeatResponse) { response.Status = proto.TaskSucceeds stat := s.space.Stats() diff --git a/datanode/wrap_operator.go b/datanode/wrap_operator.go index b96dd6d4ce..039982986a 100644 --- a/datanode/wrap_operator.go +++ b/datanode/wrap_operator.go @@ -249,7 +249,7 @@ func (s *DataNode) handlePacketToDeleteDataPartition(p *repl.Packet) { if err != nil { return } else { - s.space.DeletePartition(request.PartitionId) + s.space.ExpiredPartition(request.PartitionId) } } else { err = fmt.Errorf("illegal opcode ") @@ -939,7 +939,6 @@ func (s *DataNode) handlePacketToRemoveDataPartitionRaftMember(p *repl.Packet) { if err = decode.Decode(adminTask); err != nil { return } - reqData, err = json.Marshal(adminTask.Request) p.AddMesgLog(string(reqData)) if err != nil { @@ -948,7 +947,7 @@ func (s *DataNode) handlePacketToRemoveDataPartitionRaftMember(p *repl.Packet) { if err = json.Unmarshal(reqData, req); err != nil { return } - + req.ReserveResource = adminTask.ReserveResource dp := s.space.Partition(req.PartitionId) if dp == nil { return diff --git a/docker/conf/datanode.json b/docker/conf/datanode.json index b527a88037..1dd52993dc 100644 --- a/docker/conf/datanode.json +++ b/docker/conf/datanode.json @@ -7,7 +7,8 @@ "raftDir": "/cfs/log", "consulAddr": "http://192.168.0.101:8500", "exporterPort": 9500, - "cell": "cell-01", + "tickIntervalMs": 500, + "zoneName": "zone-01", "logDir": "/cfs/log", "logLevel": "info", "disks": [ diff --git a/docker/conf/datanode2.json b/docker/conf/datanode2.json new file mode 100644 index 0000000000..5d06c68dae --- /dev/null +++ b/docker/conf/datanode2.json @@ -0,0 +1,22 @@ +{ + "role": "datanode", + "listen": "17310", + "prof": "17320", + "raftHeartbeat": "17330", + "raftReplica": "17340", + "raftDir": "/cfs/log", + "consulAddr": "http://192.168.0.101:8500", + "exporterPort": 9500, + "tickIntervalMs": 500, + "zoneName": "zone-02", + "logDir": "/cfs/log", + "logLevel": "info", + "disks": [ + "/cfs/disk:10737418240" + ], + "masterAddr": [ + "192.168.0.11:17010", + "192.168.0.12:17010", + "192.168.0.13:17010" + ] +} diff --git a/docker/conf/datanode3.json b/docker/conf/datanode3.json new file mode 100644 index 0000000000..f91e84db1b --- /dev/null +++ b/docker/conf/datanode3.json @@ -0,0 +1,22 @@ +{ + "role": "datanode", + "listen": "17310", + "prof": "17320", + "raftHeartbeat": "17330", + "raftReplica": "17340", + "raftDir": "/cfs/log", + "consulAddr": "http://192.168.0.101:8500", + "exporterPort": 9500, + "tickIntervalMs": 500, + "zoneName": "zone-03", + "logDir": "/cfs/log", + "logLevel": "info", + "disks": [ + "/cfs/disk:10737418240" + ], + "masterAddr": [ + "192.168.0.11:17010", + "192.168.0.12:17010", + "192.168.0.13:17010" + ] +} diff --git a/docker/conf/metanode.json b/docker/conf/metanode.json index 4b2e5de0c7..f34b3ff503 100644 --- a/docker/conf/metanode.json +++ b/docker/conf/metanode.json @@ -6,9 +6,11 @@ "raftReplicaPort": "17240", "consulAddr": "http://192.168.0.101:8500", "exporterPort": 9500, + "tickIntervalMs": 500, "logLevel": "info", "logDir": "/cfs/log", "warnLogDir": "/cfs/log", + "zoneName": "zone-01", "totalMem": "536870912", "metadataDir": "/cfs/data/meta", "raftDir": "/cfs/data/raft", diff --git 
a/docker/conf/metanode2.json b/docker/conf/metanode2.json new file mode 100644 index 0000000000..e1bd8f69c9 --- /dev/null +++ b/docker/conf/metanode2.json @@ -0,0 +1,22 @@ +{ + "role": "metanode", + "listen": "17210", + "prof": "17220", + "raftHeartbeatPort": "17230", + "raftReplicaPort": "17240", + "consulAddr": "http://192.168.0.101:8500", + "exporterPort": 9500, + "tickIntervalMs": 500, + "logLevel": "info", + "logDir": "/cfs/log", + "warnLogDir": "/cfs/log", + "zoneName": "zone-02", + "totalMem": "536870912", + "metadataDir": "/cfs/data/meta", + "raftDir": "/cfs/data/raft", + "masterAddr": [ + "192.168.0.11:17010", + "192.168.0.12:17010", + "192.168.0.13:17010" + ] +} diff --git a/docker/conf/metanode3.json b/docker/conf/metanode3.json new file mode 100644 index 0000000000..93be973434 --- /dev/null +++ b/docker/conf/metanode3.json @@ -0,0 +1,22 @@ +{ + "role": "metanode", + "listen": "17210", + "prof": "17220", + "raftHeartbeatPort": "17230", + "raftReplicaPort": "17240", + "consulAddr": "http://192.168.0.101:8500", + "exporterPort": 9500, + "tickIntervalMs": 500, + "logLevel": "info", + "logDir": "/cfs/log", + "warnLogDir": "/cfs/log", + "zoneName": "zone-03", + "totalMem": "536870912", + "metadataDir": "/cfs/data/meta", + "raftDir": "/cfs/data/raft", + "masterAddr": [ + "192.168.0.11:17010", + "192.168.0.12:17010", + "192.168.0.13:17010" + ] +} diff --git a/docker/docker-compose-multi-zone.yaml b/docker/docker-compose-multi-zone.yaml new file mode 100644 index 0000000000..02f7894be1 --- /dev/null +++ b/docker/docker-compose-multi-zone.yaml @@ -0,0 +1,572 @@ +version: '2.1' + +networks: + extnetwork: + ipam: + config: + - subnet: 192.168.0.0/24 + gateway: 192.168.0.1 + +services: + monitor: + image: chubaofs/cfs-base:1.1 + depends_on: + - consul + - prometheus + - grafana + networks: + extnetwork: + + servers: + image: chubaofs/cfs-base:1.1 + depends_on: + - master1 + - master2 + - master3 + - metanode1 + - metanode2 + - metanode3 + - metanode4 + - metanode5 + - metanode6 + - metanode7 + - datanode1 + - datanode2 + - datanode3 + - datanode4 + - datanode5 + - datanode6 + - datanode7 + - objectnode1 + - objectnode2 + - objectnode3 + - console1 + - nginx + networks: + extnetwork: + + master1: + image: chubaofs/cfs-base:1.1 + ports: + - "5901" + - "5902" + - "17010" + - "17020" + - 9500 + volumes: + - ${DiskPath:-./docker_data}/master1/data:/cfs/data + - ./bin:/cfs/bin:ro + - ${DiskPath:-./docker_data}/master1/log:/cfs/log + - ./conf/master1.json:/cfs/conf/master.json + - ./script/start_master.sh:/cfs/script/start.sh + command: /bin/sh /cfs/script/start.sh + restart: on-failure + privileged: true + networks: + extnetwork: + ipv4_address: 192.168.0.11 + + master2: + image: chubaofs/cfs-base:1.1 + ports: + - "5901" + - "5902" + - "17010" + - "17020" + - 9500 + volumes: + - ${DiskPath:-./docker_data}/master2/data:/cfs/data + - ./bin:/cfs/bin:ro + - ${DiskPath:-./docker_data}/master2/log:/cfs/log + - ./conf/master2.json:/cfs/conf/master.json + - ./script/start_master.sh:/cfs/script/start.sh + command: /bin/sh /cfs/script/start.sh + restart: on-failure + privileged: true + networks: + extnetwork: + ipv4_address: 192.168.0.12 + master3: + image: chubaofs/cfs-base:1.1 + ports: + - "5901" + - "5902" + - "17010" + - "17020" + - 9500 + volumes: + - ${DiskPath:-./docker_data}/master3/data:/cfs/data + - ./bin:/cfs/bin:ro + - ${DiskPath:-./docker_data}/master3/log:/cfs/log + - ./conf/master3.json:/cfs/conf/master.json + - ./script/start_master.sh:/cfs/script/start.sh + command: /bin/sh 
/cfs/script/start.sh + restart: on-failure + privileged: true + networks: + extnetwork: + ipv4_address: 192.168.0.13 + + metanode1: + image: chubaofs/cfs-base:1.1 + ports: + - "17210" + - "17220" + - "17230" + - "17240" + - 9500 + volumes: + - ${DiskPath:-./docker_data}/metanode1/data:/cfs/data + - ./bin:/cfs/bin:ro + - ${DiskPath:-./docker_data}/metanode1/log:/cfs/log + - ./conf/metanode.json:/cfs/conf/metanode.json + - ./script/start_meta.sh:/cfs/script/start.sh + command: /bin/bash /cfs/script/start.sh + restart: on-failure + privileged: true + networks: + extnetwork: + ipv4_address: 192.168.0.21 + + metanode2: + image: chubaofs/cfs-base:1.1 + ports: + - "17210" + - "17220" + - "17230" + - "17240" + - 9500 + volumes: + - ${DiskPath:-./docker_data}/metanode2/data:/cfs/data + - ./bin:/cfs/bin:ro + - ${DiskPath:-./docker_data}/metanode2/log:/cfs/log + - ./conf/metanode.json:/cfs/conf/metanode.json + - ./script/start_meta.sh:/cfs/script/start.sh + command: /bin/bash /cfs/script/start.sh + restart: on-failure + privileged: true + networks: + extnetwork: + ipv4_address: 192.168.0.22 + + metanode3: + image: chubaofs/cfs-base:1.1 + ports: + - "17210" + - "17220" + - "17230" + - "17240" + - 9500 + volumes: + - ${DiskPath:-./docker_data}/metanode3/data:/cfs/data + - ./bin:/cfs/bin:ro + - ${DiskPath:-./docker_data}/metanode3/log:/cfs/log + - ./conf/metanode.json:/cfs/conf/metanode.json + - ./script/start_meta.sh:/cfs/script/start.sh + command: /bin/bash /cfs/script/start.sh + restart: on-failure + privileged: true + networks: + extnetwork: + ipv4_address: 192.168.0.23 + + metanode4: + image: chubaofs/cfs-base:1.1 + ports: + - "17210" + - "17220" + - "17230" + - "17240" + - 9500 + volumes: + - ${DiskPath:-./docker_data}/metanode4/data:/cfs/data + - ./bin:/cfs/bin:ro + - ${DiskPath:-./docker_data}/metanode4/log:/cfs/log + - ./conf/metanode2.json:/cfs/conf/metanode.json + - ./script/start_meta.sh:/cfs/script/start.sh + command: /bin/bash /cfs/script/start.sh + restart: on-failure + privileged: true + networks: + extnetwork: + ipv4_address: 192.168.0.24 + + metanode5: + image: chubaofs/cfs-base:1.1 + ports: + - "17210" + - "17220" + - "17230" + - "17240" + - 9500 + volumes: + - ${DiskPath:-./docker_data}/metanode5/data:/cfs/data + - ./bin:/cfs/bin:ro + - ${DiskPath:-./docker_data}/metanode5/log:/cfs/log + - ./conf/metanode2.json:/cfs/conf/metanode.json + - ./script/start_meta.sh:/cfs/script/start.sh + command: /bin/bash /cfs/script/start.sh + restart: on-failure + privileged: true + networks: + extnetwork: + ipv4_address: 192.168.0.25 + + metanode6: + image: chubaofs/cfs-base:1.1 + ports: + - "17210" + - "17220" + - "17230" + - "17240" + - 9500 + volumes: + - ${DiskPath:-./docker_data}/metanode6/data:/cfs/data + - ./bin:/cfs/bin:ro + - ${DiskPath:-./docker_data}/metanode6/log:/cfs/log + - ./conf/metanode3.json:/cfs/conf/metanode.json + - ./script/start_meta.sh:/cfs/script/start.sh + command: /bin/bash /cfs/script/start.sh + restart: on-failure + privileged: true + networks: + extnetwork: + ipv4_address: 192.168.0.26 + + metanode7: + image: chubaofs/cfs-base:1.1 + ports: + - "17210" + - "17220" + - "17230" + - "17240" + - 9500 + volumes: + - ${DiskPath:-./docker_data}/metanode7/data:/cfs/data + - ./bin:/cfs/bin:ro + - ${DiskPath:-./docker_data}/metanode7/log:/cfs/log + - ./conf/metanode3.json:/cfs/conf/metanode.json + - ./script/start_meta.sh:/cfs/script/start.sh + command: /bin/bash /cfs/script/start.sh + restart: on-failure + privileged: true + networks: + extnetwork: + ipv4_address: 192.168.0.27 + + 
datanode1: + image: chubaofs/cfs-base:1.1 + ports: + - "17310" + - "17320" + - "17330" + - "17340" + - 9500 + volumes: + - ${DiskPath:-./docker_data}/datanode1/disk:/cfs/disk + - ./bin:/cfs/bin:ro + - ${DiskPath:-./docker_data}/datanode1/log:/cfs/log + - ./conf/datanode.json:/cfs/conf/datanode.json + - ./script/start_datanode.sh:/cfs/script/start.sh + command: /bin/bash /cfs/script/start.sh + restart: on-failure + privileged: true + networks: + extnetwork: + ipv4_address: 192.168.0.31 + + datanode2: + image: chubaofs/cfs-base:1.1 + ports: + - "17310" + - "17320" + - "17330" + - "17340" + - 9500 + volumes: + - ${DiskPath:-./docker_data}/datanode2/disk:/cfs/disk + - ./bin:/cfs/bin:ro + - ${DiskPath:-./docker_data}/datanode2/log:/cfs/log + - ./conf/datanode.json:/cfs/conf/datanode.json + - ./script/start_datanode.sh:/cfs/script/start.sh + command: /bin/sh /cfs/script/start.sh + restart: on-failure + privileged: true + networks: + extnetwork: + ipv4_address: 192.168.0.32 + + datanode3: + image: chubaofs/cfs-base:1.1 + ports: + - "17310" + - "17320" + - "17330" + - "17340" + - 9500 + volumes: + - ${DiskPath:-./docker_data}/datanode3/disk:/cfs/disk + - ./bin:/cfs/bin:ro + - ${DiskPath:-./docker_data}/datanode3/log:/cfs/log + - ./conf/datanode.json:/cfs/conf/datanode.json + - ./script/start_datanode.sh:/cfs/script/start.sh + command: /bin/sh /cfs/script/start.sh + restart: on-failure + privileged: true + networks: + extnetwork: + ipv4_address: 192.168.0.33 + + datanode4: + image: chubaofs/cfs-base:1.1 + ports: + - "17310" + - "17320" + - "17330" + - "17340" + - 9500 + volumes: + - ${DiskPath:-./docker_data}/datanode4/disk:/cfs/disk + - ./bin:/cfs/bin:ro + - ${DiskPath:-./docker_data}/datanode4/log:/cfs/log + - ./conf/datanode2.json:/cfs/conf/datanode.json + - ./script/start_datanode.sh:/cfs/script/start.sh + command: /bin/sh /cfs/script/start.sh + restart: on-failure + privileged: true + networks: + extnetwork: + ipv4_address: 192.168.0.34 + + datanode5: + image: chubaofs/cfs-base:1.1 + ports: + - "17310" + - "17320" + - "17330" + - "17340" + - 9500 + volumes: + - ${DiskPath:-./docker_data}/datanode5/disk:/cfs/disk + - ./bin:/cfs/bin:ro + - ${DiskPath:-./docker_data}/datanode5/log:/cfs/log + - ./conf/datanode2.json:/cfs/conf/datanode.json + - ./script/start_datanode.sh:/cfs/script/start.sh + command: /bin/sh /cfs/script/start.sh + restart: on-failure + privileged: true + networks: + extnetwork: + ipv4_address: 192.168.0.35 + + datanode6: + image: chubaofs/cfs-base:1.1 + ports: + - "17310" + - "17320" + - "17330" + - "17340" + - 9500 + volumes: + - ${DiskPath:-./docker_data}/datanode6/disk:/cfs/disk + - ./bin:/cfs/bin:ro + - ${DiskPath:-./docker_data}/datanode6/log:/cfs/log + - ./conf/datanode3.json:/cfs/conf/datanode.json + - ./script/start_datanode.sh:/cfs/script/start.sh + command: /bin/sh /cfs/script/start.sh + restart: on-failure + privileged: true + networks: + extnetwork: + ipv4_address: 192.168.0.36 + + datanode7: + image: chubaofs/cfs-base:1.1 + ports: + - "17310" + - "17320" + - "17330" + - "17340" + - 9500 + volumes: + - ${DiskPath:-./docker_data}/datanode7/disk:/cfs/disk + - ./bin:/cfs/bin:ro + - ${DiskPath:-./docker_data}/datanode7/log:/cfs/log + - ./conf/datanode3.json:/cfs/conf/datanode.json + - ./script/start_datanode.sh:/cfs/script/start.sh + command: /bin/sh /cfs/script/start.sh + restart: on-failure + privileged: true + networks: + extnetwork: + ipv4_address: 192.168.0.37 + + objectnode1: + image: chubaofs/cfs-base:1.1 + ports: + - "80" + - 9500 + volumes: + - ./bin:/cfs/bin:ro 
+ - ${DiskPath:-./docker_data}/objectnode1/log:/cfs/log + - ./conf/objectnode.json:/cfs/conf/objectnode.json + - ./script/start_objectnode.sh:/cfs/script/start.sh + command: /bin/sh /cfs/script/start.sh + restart: on-failure + privileged: true + environment: + - TZ=Asia/Shanghai + networks: + extnetwork: + ipv4_address: 192.168.0.41 + + objectnode2: + image: chubaofs/cfs-base:1.1 + ports: + - "80" + - 9500 + volumes: + - ./bin:/cfs/bin:ro + - ${DiskPath:-./docker_data}/objectnode2/log:/cfs/log + - ./conf/objectnode.json:/cfs/conf/objectnode.json + - ./script/start_objectnode.sh:/cfs/script/start.sh + command: /bin/sh /cfs/script/start.sh + restart: on-failure + privileged: true + environment: + - TZ=Asia/Shanghai + networks: + extnetwork: + ipv4_address: 192.168.0.42 + + objectnode3: + image: chubaofs/cfs-base:1.1 + ports: + - "80" + - 9500 + volumes: + - ./bin:/cfs/bin:ro + - ${DiskPath:-./docker_data}/objectnode3/log:/cfs/log + - ./conf/objectnode.json:/cfs/conf/objectnode.json + - ./script/start_objectnode.sh:/cfs/script/start.sh + command: /bin/sh /cfs/script/start.sh + restart: on-failure + privileged: true + environment: + - TZ=Asia/Shanghai + networks: + extnetwork: + ipv4_address: 192.168.0.43 + + + console1: + image: chubaofs/cfs-base:1.1 + ports: + - "80" + volumes: + - ./bin:/cfs/bin:ro + - ${DiskPath:-./docker_data}/console/log:/cfs/log + - ./conf/console.json:/cfs/conf/console.json + - ./script/start_console.sh:/cfs/script/start.sh + command: /bin/sh /cfs/script/start.sh + restart: on-failure + privileged: true + networks: + extnetwork: + ipv4_address: 192.168.0.50 + + client: + image: chubaofs/cfs-base:1.1 + ports: + - 9500 + volumes: + - ./bin:/cfs/bin:ro + - ./conf/hosts:/etc/hosts:ro + - ./conf/client.json:/cfs/conf/client.json + - ${DiskPath:-./docker_data}/client/log:/cfs/log + - ./script/run_test.sh:/cfs/script/start.sh + - ./script/start_client.sh:/cfs/script/start_client.sh + - ./ltp/runtest/fs:/opt/ltp/runtest/fs + - ./s3tests:/opt/s3tests:ro + privileged: true + devices: + - /dev/fuse:/dev/fuse:rwm + cap_add: + - SYS_ADMIN + command: /bin/bash /cfs/script/start.sh + networks: + extnetwork: + + consul: + image: consul:1.5 + ports: + - 8500:8500 + volumes: + - ./monitor:/monitor + privileged: true + networks: + extnetwork: + ipv4_address: 192.168.0.101 + + prometheus: + image: prom/prometheus + ports: + - 9090:9090 + volumes: + - ./monitor/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml + privileged: true + networks: + extnetwork: + ipv4_address: 192.168.0.102 + + grafana: + image: grafana/grafana:6.4.4 + environment: + - GF_SECURITY_ADMIN_PASSWORD=123456 + ports: + - 3000:3000 + volumes: + - ./monitor/grafana/grafana.ini:/etc/grafana/grafna.ini + - ./monitor/grafana/provisioning:/etc/grafana/provisioning + - ./monitor/grafana/init.sh:/grafana/init.sh + privileged: true + #command: /bin/bash + networks: + extnetwork: + ipv4_address: 192.168.0.103 + + nginx: + image: nginx:1.17.8 + ports: + - "80:80" + volumes: + - ./conf/nginx.conf:/etc/nginx/nginx.conf:ro + command: /bin/bash -c "nginx -g 'daemon off;'" + restart: on-failure + privileged: true + networks: + extnetwork: + ipv4_address: 192.168.0.104 + + build: + image: chubaofs/cfs-base:1.1 + volumes: + - ../:/go/src/github.com/chubaofs/chubaofs + command: + /bin/bash /go/src/github.com/chubaofs/chubaofs/docker/script/build.sh + networks: + extnetwork: + + unit_test: + image: chubaofs/cfs-base:1.1 + volumes: + - ../:/go/src/github.com/chubaofs/chubaofs + command: + - bash + - "-c" + - >- + set -e; + mkdir -p 
/go/src/github.com/chubaofs/chubaofs/docker/bin && + cd /go/src/github.com/chubaofs/chubaofs && make test + networks: + extnetwork: diff --git a/docker/script/run_test.sh b/docker/script/run_test.sh index 8455abe26c..7906346929 100755 --- a/docker/script/run_test.sh +++ b/docker/script/run_test.sh @@ -31,8 +31,10 @@ AuthKey="0e20229116d5a9a4a9e876806b514a85" init_cli() { cp ${cli} /usr/bin/ cd ${conf_path} - ${cli} completion + ${cli} completion &> /dev/null echo 'source '${conf_path}'/cfs-cli.sh' >> ~/.bashrc + echo -n "cli tool ... " + echo -e "\033[32mdone\033[0m" } check_cluster() { echo -n "Checking cluster ... " @@ -55,7 +57,7 @@ ensure_node_writable() { for i in $(seq 1 300) ; do ${cli} ${node} list &> /tmp/cli_${node}_list; res=`cat /tmp/cli_${node}_list | grep "Yes" | grep "Active" | wc -l` - if [[ ${res} -eq 4 ]]; then + if [[ ${res} -ge 4 ]]; then echo -e "\033[32mdone\033[0m" return fi @@ -95,7 +97,7 @@ create_volume() { echo -e "\033[32mdone\033[0m" return fi - ${cli} volume create ${VolName} ${Owner} --capacity=30 -y > /dev/null + ${cli} volume create ${VolName} ${Owner} --zonename=zone-01 --capacity=30 -y > /dev/null if [[ $? -ne 0 ]]; then echo -e "\033[31mfail\033[0m" exit 1 @@ -146,7 +148,7 @@ print_error_info() { start_client() { echo -n "Starting client ... " nohup /cfs/bin/cfs-client -c /cfs/conf/client.json >/cfs/log/cfs.out 2>&1 & - sleep 10 + sleep 5 res=$( stat $MntPoint | grep -q "Inode: 1" ; echo $? ) if [[ $res -ne 0 ]] ; then echo -e "\033[31mfail\033[0m" @@ -269,6 +271,6 @@ add_data_partitions ; sleep 3 show_cluster_info start_client ; sleep 2 run_ltptest -run_s3_test +#run_s3_test stop_client delete_volume diff --git a/docker/script/start_client.sh b/docker/script/start_client.sh index beab6dfd6d..3ca606d50c 100755 --- a/docker/script/start_client.sh +++ b/docker/script/start_client.sh @@ -77,7 +77,7 @@ ensure_node_writable() { for i in $(seq 1 300) ; do ${cli} ${node} list &> /tmp/cli_${node}_list; res=`cat /tmp/cli_${node}_list | grep "Yes" | grep "Active" | wc -l` - if [[ ${res} -eq 4 ]]; then + if [[ ${res} -ge 3 ]]; then echo -e "\033[32mdone\033[0m" return fi @@ -96,7 +96,7 @@ create_volume() { echo -e "\033[32mdone\033[0m" return fi - ${cli} volume create ${VolName} ${Owner} --capacity=30 -y > /dev/null + ${cli} volume create ${VolName} ${Owner} --zonename=zone-01 --capacity=30 -y > /dev/null if [[ $? 
-ne 0 ]]; then echo -e "\033[31mfail\033[0m" exit 1 diff --git a/master/api_service.go b/master/api_service.go index ba0c0e4551..a6561f5a2c 100644 --- a/master/api_service.go +++ b/master/api_service.go @@ -202,19 +202,23 @@ func (m *Server) clusterStat(w http.ResponseWriter, r *http.Request) { func (m *Server) getCluster(w http.ResponseWriter, r *http.Request) { cv := &proto.ClusterView{ - Name: m.cluster.Name, - LeaderAddr: m.leaderInfo.addr, - DisableAutoAlloc: m.cluster.DisableAutoAllocate, - MetaNodeThreshold: m.cluster.cfg.MetaNodeThreshold, - Applied: m.fsm.applied, - MaxDataPartitionID: m.cluster.idAlloc.dataPartitionID, - MaxMetaNodeID: m.cluster.idAlloc.commonID, - MaxMetaPartitionID: m.cluster.idAlloc.metaPartitionID, - MetaNodes: make([]proto.NodeView, 0), - DataNodes: make([]proto.NodeView, 0), - VolStatInfo: make([]*proto.VolStatInfo, 0), - BadPartitionIDs: make([]proto.BadPartitionView, 0), - BadMetaPartitionIDs: make([]proto.BadPartitionView, 0), + Name: m.cluster.Name, + LeaderAddr: m.leaderInfo.addr, + DisableAutoAlloc: m.cluster.DisableAutoAllocate, + MetaNodeThreshold: m.cluster.cfg.MetaNodeThreshold, + DpRecoverPool: m.cluster.cfg.DataPartitionsRecoverPoolSize, + MpRecoverPool: m.cluster.cfg.MetaPartitionsRecoverPoolSize, + Applied: m.fsm.applied, + MaxDataPartitionID: m.cluster.idAlloc.dataPartitionID, + MaxMetaNodeID: m.cluster.idAlloc.commonID, + MaxMetaPartitionID: m.cluster.idAlloc.metaPartitionID, + MetaNodes: make([]proto.NodeView, 0), + DataNodes: make([]proto.NodeView, 0), + VolStatInfo: make([]*proto.VolStatInfo, 0), + BadPartitionIDs: make([]proto.BadPartitionView, 0), + BadMetaPartitionIDs: make([]proto.BadPartitionView, 0), + MigratedDataPartitions: make([]proto.BadPartitionView, 0), + MigratedMetaPartitions: make([]proto.BadPartitionView, 0), } vols := m.cluster.allVolNames() @@ -230,8 +234,35 @@ func (m *Server) getCluster(w http.ResponseWriter, r *http.Request) { } cv.VolStatInfo = append(cv.VolStatInfo, stat.(*volStatInfo)) } - cv.BadPartitionIDs = m.cluster.getBadDataPartitionsView() - cv.BadMetaPartitionIDs = m.cluster.getBadMetaPartitionsView() + + m.cluster.BadDataPartitionIds.Range(func(key, value interface{}) bool { + badDataPartitionIds := value.([]uint64) + path := key.(string) + bpv := badPartitionView{Path: path, PartitionIDs: badDataPartitionIds} + cv.BadPartitionIDs = append(cv.BadPartitionIDs, bpv) + return true + }) + m.cluster.BadMetaPartitionIds.Range(func(key, value interface{}) bool { + badPartitionIds := value.([]uint64) + path := key.(string) + bpv := badPartitionView{Path: path, PartitionIDs: badPartitionIds} + cv.BadMetaPartitionIDs = append(cv.BadMetaPartitionIDs, bpv) + return true + }) + m.cluster.MigratedDataPartitionIds.Range(func(key, value interface{}) bool { + badPartitionIds := value.([]uint64) + path := key.(string) + bpv := badPartitionView{Path: path, PartitionIDs: badPartitionIds} + cv.MigratedDataPartitions = append(cv.MigratedDataPartitions, bpv) + return true + }) + m.cluster.MigratedMetaPartitionIds.Range(func(key, value interface{}) bool { + badPartitionIds := value.([]uint64) + path := key.(string) + bpv := badPartitionView{Path: path, PartitionIDs: badPartitionIds} + cv.MigratedMetaPartitions = append(cv.MigratedMetaPartitions, bpv) + return true + }) sendOkReply(w, r, newSuccessHTTPReply(cv)) } @@ -392,6 +423,47 @@ func (m *Server) addDataReplica(w http.ResponseWriter, r *http.Request) { sendOkReply(w, r, newSuccessHTTPReply(msg)) } +func (m *Server) resetDataPartitionHosts(w http.ResponseWriter, r 
*http.Request) { + var ( + msg string + addr string + dp *DataPartition + partitionID uint64 + err error + ) + if partitionID, addr, err = parseRequestToRemoveDataReplica(r); err != nil { + sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()}) + return + } + + if dp, err = m.cluster.getDataPartitionByID(partitionID); err != nil { + sendErrReply(w, r, newErrHTTPReply(proto.ErrDataPartitionNotExists)) + return + } + + hosts := make([]string, 0) + peers := make([]proto.Peer, 0) + + for _, host := range dp.Hosts { + if host == addr { + continue + } + hosts = append(hosts, host) + } + for _, peer := range dp.Peers { + if peer.Addr == addr { + continue + } + peers = append(peers, peer) + } + + if err = dp.update("resetDataPartitionHosts", dp.VolName, peers, hosts, m.cluster); err != nil { + return + } + msg = fmt.Sprintf("data partitionID :%v reset hosts [%v] successfully", partitionID, addr) + sendOkReply(w, r, newSuccessHTTPReply(msg)) +} + func (m *Server) deleteDataReplica(w http.ResponseWriter, r *http.Request) { var ( msg string @@ -410,8 +482,9 @@ func (m *Server) deleteDataReplica(w http.ResponseWriter, r *http.Request) { sendErrReply(w, r, newErrHTTPReply(proto.ErrDataPartitionNotExists)) return } - - if err = m.cluster.removeDataReplica(dp, addr, true); err != nil { + dp.offlineMutex.Lock() + defer dp.offlineMutex.Unlock() + if err = m.cluster.removeDataReplica(dp, addr, true, false); err != nil { sendErrReply(w, r, newErrHTTPReply(err)) return } @@ -466,8 +539,9 @@ func (m *Server) deleteMetaReplica(w http.ResponseWriter, r *http.Request) { sendErrReply(w, r, newErrHTTPReply(proto.ErrMetaPartitionNotExists)) return } - - if err = m.cluster.deleteMetaReplica(mp, addr, true); err != nil { + mp.offlineMutex.Lock() + defer mp.offlineMutex.Unlock() + if err = m.cluster.deleteMetaReplica(mp, addr, true, false); err != nil { sendErrReply(w, r, newErrHTTPReply(err)) return } @@ -494,7 +568,7 @@ func (m *Server) decommissionDataPartition(w http.ResponseWriter, r *http.Reques sendErrReply(w, r, newErrHTTPReply(proto.ErrDataPartitionNotExists)) return } - if err = m.cluster.decommissionDataPartition(addr, dp, handleDataPartitionOfflineErr); err != nil { + if err = m.cluster.decommissionDataPartition(addr, dp, getTargetAddressForDataPartitionDecommission, handleDataPartitionOfflineErr, "", false); err != nil { sendErrReply(w, r, newErrHTTPReply(err)) return } @@ -502,6 +576,63 @@ func (m *Server) decommissionDataPartition(w http.ResponseWriter, r *http.Reques sendOkReply(w, r, newSuccessHTTPReply(rstMsg)) } +func (m *Server) setNodeToOfflineState(w http.ResponseWriter, r *http.Request) { + var ( + err error + startID uint64 + endID uint64 + nodeType string + zoneName string + state bool + ) + if startID, endID, nodeType, zoneName, state, err = parseRequestToSetNodeToOfflineState(r); err != nil { + sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()}) + return + } + + if nodeType == nodeTypeAll { + m.cluster.setDataNodeToOfflineState(startID, endID, state, zoneName) + m.cluster.setMetaNodeToOfflineState(startID, endID, state, zoneName) + } else { + if nodeType == nodeTypeDataNode { + m.cluster.setDataNodeToOfflineState(startID, endID, state, zoneName) + } else { + m.cluster.setMetaNodeToOfflineState(startID, endID, state, zoneName) + } + } + sendOkReply(w, r, newSuccessHTTPReply("success")) +} + +func parseRequestToSetNodeToOfflineState(r *http.Request) (startID, endID uint64, nodeType, zoneName string, state bool, err error) { + var 
+	if value = r.FormValue(startKey); value == "" {
+		err = keyNotFound(startKey)
+		return
+	}
+	startID, err = strconv.ParseUint(value, 10, 64)
+	if err != nil {
+		return
+	}
+	if value = r.FormValue(endKey); value == "" {
+		err = keyNotFound(endKey)
+		return
+	}
+	endID, err = strconv.ParseUint(value, 10, 64)
+	if err != nil {
+		return
+	}
+	nodeType = r.FormValue(nodeTypeKey)
+	if !(nodeType == nodeTypeDataNode || nodeType == nodeTypeMetaNode || nodeType == nodeTypeAll) {
+		err = fmt.Errorf("nodeType must be dataNode, metaNode or all")
+		return
+	}
+	if zoneName, err = extractZoneName(r); err != nil {
+		return
+	}
+	state, err = strconv.ParseBool(r.FormValue(stateKey))
+	return
+}
+
 func (m *Server) diagnoseDataPartition(w http.ResponseWriter, r *http.Request) {
 	var (
 		err error
@@ -569,23 +700,26 @@ func (m *Server) markDeleteVol(w http.ResponseWriter, r *http.Request) {
 
 func (m *Server) updateVol(w http.ResponseWriter, r *http.Request) {
 	var (
-		name           string
-		authKey        string
-		err            error
-		msg            string
-		capacity       uint64
-		replicaNum     int
-		followerRead   bool
-		authenticate   bool
-		enableToken    bool
-		zoneName       string
-		description    string
-		dpSelectorName string
-		dpSelectorParm string
-		vol            *Vol
+		name         string
+		authKey      string
+		err          error
+		msg          string
+		capacity     int
+		replicaNum   int
+		followerRead bool
+		authenticate bool
+		enableToken  bool
+		autoRepair   bool
+		zoneName     string
+		description  string
+		vol          *Vol
 	)
-
-	if name, authKey, description, err = parseRequestToUpdateVol(r); err != nil {
+	if name, authKey, replicaNum, err = parseRequestToUpdateVol(r); err != nil {
+		sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
+		return
+	}
+	if replicaNum != 0 && !(replicaNum == 2 || replicaNum == 3) {
+		err = fmt.Errorf("replicaNum can only be 2 or 3, received replicaNum is[%v]", replicaNum)
 		sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
 		return
 	}
@@ -593,34 +727,18 @@ func (m *Server) updateVol(w http.ResponseWriter, r *http.Request) {
 		sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeVolNotExists, Msg: err.Error()})
 		return
 	}
-	if zoneName, capacity, replicaNum, enableToken, dpSelectorName, dpSelectorParm, err =
-		parseDefaultInfoToUpdateVol(r, vol); err != nil {
+	if zoneName, capacity, description, err = parseDefaultInfoToUpdateVol(r, vol); err != nil {
 		sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
 		return
 	}
-	if replicaNum != 0 && !(replicaNum == 2 || replicaNum == 3) {
-		err = fmt.Errorf("replicaNum can only be 2 and 3,received replicaNum is[%v]", replicaNum)
-		sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
-		return
+	if replicaNum == 0 {
+		replicaNum = int(vol.dpReplicaNum)
 	}
-
-	if followerRead, authenticate, err = parseBoolFieldToUpdateVol(r, vol); err != nil {
+	if followerRead, authenticate, enableToken, autoRepair, err = parseBoolFieldToUpdateVol(r, vol); err != nil {
 		sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
 		return
 	}
-
-	newArgs := getVolVarargs(vol)
-
-	newArgs.zoneName = zoneName
-	newArgs.description = description
-	newArgs.capacity = capacity
-	newArgs.followerRead = followerRead
-	newArgs.authenticate = authenticate
-	newArgs.enableToken = enableToken
-	newArgs.dpSelectorName = dpSelectorName
-	newArgs.dpSelectorParm = dpSelectorParm
-
-	if err = m.cluster.updateVol(name, authKey, newArgs); err != nil {
+	if err = m.cluster.updateVol(name, authKey, zoneName, description, uint64(capacity), uint8(replicaNum), followerRead, authenticate, enableToken, autoRepair); err != nil {
 		sendErrReply(w, r, newErrHTTPReply(err))
 		return
 	}
@@ -651,14 +769,12 @@ func (m *Server) volExpand(w http.ResponseWriter, r *http.Request) {
 		return
 	}
 
-	newArgs := getVolVarargs(vol)
-	newArgs.capacity = uint64(capacity)
-
-	if err = m.cluster.updateVol(name, authKey, newArgs); err != nil {
+	if err = m.cluster.updateVol(name, authKey, vol.zoneName, vol.description, uint64(capacity),
+		vol.dpReplicaNum, vol.FollowerRead, vol.authenticate, vol.enableToken, vol.autoRepair); err != nil {
 		sendErrReply(w, r, newErrHTTPReply(err))
 		return
 	}
-	msg = fmt.Sprintf("update vol[%v] successfully\n", name)
+	msg = fmt.Sprintf("expand vol[%v] successfully\n", name)
 	sendOkReply(w, r, newSuccessHTTPReply(msg))
 }
 
@@ -684,18 +800,14 @@ func (m *Server) volShrink(w http.ResponseWriter, r *http.Request) {
 		sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
 		return
 	}
-
-	newArgs := getVolVarargs(vol)
-	newArgs.capacity = uint64(capacity)
-
-	if err = m.cluster.updateVol(name, authKey, newArgs); err != nil {
+	if err = m.cluster.updateVol(name, authKey, vol.zoneName, vol.description, uint64(capacity),
+		vol.dpReplicaNum, vol.FollowerRead, vol.authenticate, vol.enableToken, vol.autoRepair); err != nil {
 		sendErrReply(w, r, newErrHTTPReply(err))
 		return
 	}
-	msg = fmt.Sprintf("update vol[%v] successfully\n", name)
+	msg = fmt.Sprintf("shrink vol[%v] successfully\n", name)
 	sendOkReply(w, r, newSuccessHTTPReply(msg))
 }
-
 func (m *Server) createVol(w http.ResponseWriter, r *http.Request) {
 	var (
 		name         string
@@ -709,13 +821,13 @@ func (m *Server) createVol(w http.ResponseWriter, r *http.Request) {
 		vol          *Vol
 		followerRead bool
 		authenticate bool
-		crossZone    bool
 		enableToken  bool
+		autoRepair   bool
 		zoneName     string
 		description  string
 	)
-	if name, owner, zoneName, description, mpCount, dpReplicaNum, size, capacity, followerRead, authenticate, crossZone, enableToken, err = parseRequestToCreateVol(r); err != nil {
+	if name, owner, zoneName, description, mpCount, dpReplicaNum, size, capacity, followerRead, authenticate, enableToken, autoRepair, err = parseRequestToCreateVol(r); err != nil {
 		sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
 		return
 	}
@@ -724,7 +836,7 @@ func (m *Server) createVol(w http.ResponseWriter, r *http.Request) {
 		sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
 		return
 	}
-	if vol, err = m.cluster.createVol(name, owner, zoneName, description, mpCount, dpReplicaNum, size, capacity, followerRead, authenticate, crossZone, enableToken); err != nil {
+	if vol, err = m.cluster.createVol(name, owner, zoneName, description, mpCount, dpReplicaNum, size, capacity, followerRead, authenticate, enableToken, autoRepair); err != nil {
 		sendErrReply(w, r, newErrHTTPReply(err))
 		return
 	}
@@ -781,8 +893,9 @@ func newSimpleView(vol *Vol) *proto.SimpleVolView {
 		FollowerRead:       vol.FollowerRead,
 		NeedToLowerReplica: vol.NeedToLowerReplica,
 		Authenticate:       vol.authenticate,
-		CrossZone:          vol.crossZone,
 		EnableToken:        vol.enableToken,
+		CrossZone:          vol.crossZone,
+		AutoRepair:         vol.autoRepair,
 		Tokens:             vol.tokens,
 		RwDpCnt:            vol.dataPartitions.readableAndWritableCnt,
 		MpCnt:              len(vol.MetaPartitions),
@@ -847,6 +960,8 @@ func (m *Server) getDataNode(w http.ResponseWriter, r *http.Request) {
 		NodeSetID:                 dataNode.NodeSetID,
 		PersistenceDataPartitions: dataNode.PersistenceDataPartitions,
 		BadDisks:                  dataNode.BadDisks,
+		ToBeOffline:               dataNode.ToBeOffline,
+
ToBeMigrated: dataNode.ToBeMigrated, } sendOkReply(w, r, newSuccessHTTPReply(dataNodeInfo)) @@ -855,13 +970,15 @@ func (m *Server) getDataNode(w http.ResponseWriter, r *http.Request) { // Decommission a data node. This will decommission all the data partition on that node. func (m *Server) decommissionDataNode(w http.ResponseWriter, r *http.Request) { var ( - node *DataNode - rstMsg string - offLineAddr string - err error + node *DataNode + rstMsg string + offLineAddr string + destZoneName string + strictFlag bool + err error ) - if offLineAddr, err = parseAndExtractNodeAddr(r); err != nil { + if offLineAddr, destZoneName, err = parseRequestForDecommissionDataNode(r); err != nil { sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()}) return } @@ -870,7 +987,13 @@ func (m *Server) decommissionDataNode(w http.ResponseWriter, r *http.Request) { sendErrReply(w, r, newErrHTTPReply(proto.ErrDataNodeNotExists)) return } - if err = m.cluster.decommissionDataNode(node); err != nil { + + if strictFlag, err = extractStrictFlag(r); err != nil { + sendErrReply(w, r, newErrHTTPReply(err)) + return + } + + if err = m.cluster.decommissionDataNode(node, destZoneName, strictFlag); err != nil { sendErrReply(w, r, newErrHTTPReply(err)) return } @@ -922,6 +1045,23 @@ func (m *Server) setNodeInfoHandler(w http.ResponseWriter, r *http.Request) { } } } + if val, ok := params[dpRecoverPoolSizeKey]; ok { + if v, ok := val.(int64); ok { + if err = m.cluster.setDpRecoverPoolSize(int32(v)); err != nil { + sendErrReply(w, r, newErrHTTPReply(err)) + return + } + } + } + if val, ok := params[mpRecoverPoolSizeKey]; ok { + if v, ok := val.(int64); ok { + if err = m.cluster.setMpRecoverPoolSize(int32(v)); err != nil { + sendErrReply(w, r, newErrHTTPReply(err)) + return + } + } + } + sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("set nodeinfo params %v successfully", params))) } @@ -1115,6 +1255,8 @@ func (m *Server) getMetaNode(w http.ResponseWriter, r *http.Request) { MetaPartitionCount: metaNode.MetaPartitionCount, NodeSetID: metaNode.NodeSetID, PersistenceMetaPartitions: metaNode.PersistenceMetaPartitions, + ToBeOffline: metaNode.ToBeOffline, + ToBeMigrated: metaNode.ToBeMigrated, } sendOkReply(w, r, newSuccessHTTPReply(metaNodeInfo)) } @@ -1135,7 +1277,7 @@ func (m *Server) decommissionMetaPartition(w http.ResponseWriter, r *http.Reques sendErrReply(w, r, newErrHTTPReply(proto.ErrMetaPartitionNotExists)) return } - if err = m.cluster.decommissionMetaPartition(nodeAddr, mp); err != nil { + if err = m.cluster.decommissionMetaPartition(nodeAddr, mp, getTargetAddressForMetaPartitionDecommission, false); err != nil { sendErrReply(w, r, newErrHTTPReply(err)) return } @@ -1171,6 +1313,7 @@ func (m *Server) decommissionMetaNode(w http.ResponseWriter, r *http.Request) { metaNode *MetaNode rstMsg string offLineAddr string + strictFlag bool err error ) @@ -1183,7 +1326,13 @@ func (m *Server) decommissionMetaNode(w http.ResponseWriter, r *http.Request) { sendErrReply(w, r, newErrHTTPReply(proto.ErrMetaNodeNotExists)) return } - if err = m.cluster.decommissionMetaNode(metaNode); err != nil { + + if strictFlag, err = extractStrictFlag(r); err != nil { + sendErrReply(w, r, newErrHTTPReply(err)) + return + } + + if err = m.cluster.decommissionMetaNode(metaNode, strictFlag); err != nil { sendErrReply(w, r, newErrHTTPReply(err)) return } @@ -1288,6 +1437,17 @@ func parseRequestForAddNode(r *http.Request) (nodeAddr, zoneName string, err err return } +func parseRequestForDecommissionDataNode(r 
*http.Request) (nodeAddr, zoneName string, err error) { + if err = r.ParseForm(); err != nil { + return + } + if nodeAddr, err = extractNodeAddr(r); err != nil { + return + } + zoneName = r.FormValue(zoneNameKey) + return +} + func parseAndExtractNodeAddr(r *http.Request) (nodeAddr string, err error) { if err = r.ParseForm(); err != nil { return @@ -1295,6 +1455,15 @@ func parseAndExtractNodeAddr(r *http.Request) (nodeAddr string, err error) { return extractNodeAddr(r) } +func extractStrictFlag(r *http.Request) (strict bool, err error) { + var strictStr string + if strictStr = r.FormValue(strictFlagKey); strictStr == "" { + strictStr = "false" + return + } + return strconv.ParseBool(strictStr) +} + func parseRequestToDecommissionNode(r *http.Request) (nodeAddr, diskPath string, err error) { if err = r.ParseForm(); err != nil { return @@ -1380,7 +1549,7 @@ func parseRequestToDeleteVol(r *http.Request) (name, authKey string, err error) } -func parseRequestToUpdateVol(r *http.Request) (name, authKey, description string, err error) { +func parseRequestToUpdateVol(r *http.Request) (name, authKey string, replicaNum int, err error) { if err = r.ParseForm(); err != nil { return } @@ -1390,12 +1559,10 @@ func parseRequestToUpdateVol(r *http.Request) (name, authKey, description string if authKey, err = extractAuthKey(r); err != nil { return } - description = r.FormValue(descriptionKey) return } -func parseDefaultInfoToUpdateVol(r *http.Request, vol *Vol) (zoneName string, capacity uint64, replicaNum int, - enableToken bool, dpSelectorName string, dpSelectorParm string, err error) { +func parseDefaultInfoToUpdateVol(r *http.Request, vol *Vol) (zoneName string, capacity int, description string, err error) { if err = r.ParseForm(); err != nil { return } @@ -1403,45 +1570,20 @@ func parseDefaultInfoToUpdateVol(r *http.Request, vol *Vol) (zoneName string, ca zoneName = vol.zoneName } if capacityStr := r.FormValue(volCapacityKey); capacityStr != "" { - var capacityInt int - if capacityInt, err = strconv.Atoi(capacityStr); err != nil { + if capacity, err = strconv.Atoi(capacityStr); err != nil { err = unmatchedKey(volCapacityKey) return } - capacity = uint64(capacityInt) - } else { - capacity = vol.Capacity - } - if replicaNumStr := r.FormValue(replicaNumKey); replicaNumStr != "" { - if replicaNum, err = strconv.Atoi(replicaNumStr); err != nil { - err = unmatchedKey(replicaNumKey) - return - } - } else { - replicaNum = int(vol.dpReplicaNum) - } - if enableTokenStr := r.FormValue(enableTokenKey); enableTokenStr != "" { - if enableToken, err = strconv.ParseBool(enableTokenStr); err != nil { - err = unmatchedKey(enableTokenKey) - return - } } else { - enableToken = vol.enableToken + capacity = int(vol.Capacity) } - dpSelectorName = r.FormValue(dpSelectorNameKey) - dpSelectorParm = r.FormValue(dpSelectorParmKey) - if (dpSelectorName == "") || (dpSelectorParm == "") { - if (dpSelectorName != "") || (dpSelectorParm != "") { - err = keyNotFound(dpSelectorNameKey + " or " + dpSelectorParmKey) - return - } - dpSelectorName = vol.dpSelectorName - dpSelectorParm = vol.dpSelectorParm + if description = r.FormValue(descriptionKey); description == "" { + description = vol.description } return } -func parseBoolFieldToUpdateVol(r *http.Request, vol *Vol) (followerRead, authenticate bool, err error) { +func parseBoolFieldToUpdateVol(r *http.Request, vol *Vol) (followerRead, authenticate, enableToken, autoRepair bool, err error) { if followerReadStr := r.FormValue(followerReadKey); followerReadStr != "" { if followerRead, 
err = strconv.ParseBool(followerReadStr); err != nil { err = unmatchedKey(followerReadKey) @@ -1458,6 +1600,22 @@ func parseBoolFieldToUpdateVol(r *http.Request, vol *Vol) (followerRead, authent } else { authenticate = vol.authenticate } + if enableTokenStr := r.FormValue(enableTokenKey); enableTokenStr != "" { + if enableToken, err = strconv.ParseBool(enableTokenStr); err != nil { + err = unmatchedKey(enableTokenKey) + return + } + } else { + enableToken = vol.enableToken + } + if autoRepairStr := r.FormValue(autoRepairKey); autoRepairStr != "" { + if autoRepair, err = strconv.ParseBool(autoRepairStr); err != nil { + err = unmatchedKey(autoRepairKey) + return + } + } else { + autoRepair = vol.autoRepair + } return } @@ -1477,7 +1635,7 @@ func parseRequestToSetVolCapacity(r *http.Request) (name, authKey string, capaci return } -func parseRequestToCreateVol(r *http.Request) (name, owner, zoneName, description string, mpCount, dpReplicaNum, size, capacity int, followerRead, authenticate, crossZone, enableToken bool, err error) { +func parseRequestToCreateVol(r *http.Request) (name, owner, zoneName, description string, mpCount, dpReplicaNum, size, capacity int, followerRead, authenticate, enableToken, autoRepair bool, err error) { if err = r.ParseForm(); err != nil { return } @@ -1519,11 +1677,12 @@ func parseRequestToCreateVol(r *http.Request) (name, owner, zoneName, descriptio if authenticate, err = extractAuthenticate(r); err != nil { return } - - if crossZone, err = extractCrossZone(r); err != nil { + if autoRepair, err = extractAutoRepair(r); err != nil { return } - zoneName = r.FormValue(zoneNameKey) + if zoneName = r.FormValue(zoneNameKey); zoneName == "" { + zoneName = DefaultZoneName + } enableToken = extractEnableToken(r) description = r.FormValue(descriptionKey) return @@ -1713,13 +1872,13 @@ func extractAuthenticate(r *http.Request) (authenticate bool, err error) { return } -func extractCrossZone(r *http.Request) (crossZone bool, err error) { +func extractAutoRepair(r *http.Request) (autoRepair bool, err error) { var value string - if value = r.FormValue(crossZoneKey); value == "" { - crossZone = false + if value = r.FormValue(autoRepairKey); value == "" { + autoRepair = false return } - if crossZone, err = strconv.ParseBool(value); err != nil { + if autoRepair, err = strconv.ParseBool(value); err != nil { return } return @@ -1744,28 +1903,45 @@ func parseAndExtractSetNodeInfoParams(r *http.Request) (params map[string]interf if err = r.ParseForm(); err != nil { return } - var value string noParams := true params = make(map[string]interface{}) - if value = r.FormValue(nodeDeleteBatchCountKey); value != "" { - noParams = false - var batchCount = uint64(0) - batchCount, err = strconv.ParseUint(value, 10, 64) - if err != nil { - err = unmatchedKey(nodeDeleteBatchCountKey) - return - } - params[nodeDeleteBatchCountKey] = batchCount + if noParams, err = parseNodeInfoKey(params, nodeDeleteBatchCountKey, noParams, r); err != nil { + return + } + + if noParams, err = parseNodeInfoKey(params, nodeMarkDeleteRateKey, noParams, r); err != nil { + return + } + if noParams, err = parseNodeInfoKey(params, nodeDeleteWorkerSleepMs, noParams, r); err != nil { + return + } + if noParams, err = parseNodeInfoIntKey(params, dpRecoverPoolSizeKey, noParams, r); err != nil { + return + } + if noParams, err = parseNodeInfoIntKey(params, mpRecoverPoolSizeKey, noParams, r); err != nil { + return } - if value = r.FormValue(nodeMarkDeleteRateKey); value != "" { + if noParams { + err = 
keyNotFound(nodeDeleteBatchCountKey) + return + } + return +} + +func parseNodeInfoKey(params map[string]interface{}, key string, noParams bool, r *http.Request) (noPara bool, err error) { + var value string + defer func() { + noPara = noParams + }() + if value = r.FormValue(key); value != "" { noParams = false var val = uint64(0) val, err = strconv.ParseUint(value, 10, 64) if err != nil { - err = unmatchedKey(nodeMarkDeleteRateKey) + err = unmatchedKey(key) return } - params[nodeMarkDeleteRateKey] = val + params[key] = val } if value = r.FormValue(nodeAutoRepairRateKey); value != "" { @@ -1781,17 +1957,31 @@ func parseAndExtractSetNodeInfoParams(r *http.Request) (params map[string]interf if value = r.FormValue(nodeDeleteWorkerSleepMs); value != "" { noParams = false - var val = uint64(0) - val, err = strconv.ParseUint(value, 10, 64) + var val = int64(0) + val, err = strconv.ParseInt(value, 10, 64) if err != nil { - err = unmatchedKey(nodeMarkDeleteRateKey) + err = unmatchedKey(key) return } - params[nodeDeleteWorkerSleepMs] = val + params[key] = val } - if noParams { - err = keyNotFound(nodeDeleteBatchCountKey) - return + return +} + +func parseNodeInfoIntKey(params map[string]interface{}, key string, noParams bool, r *http.Request) (noPara bool, err error) { + var value string + defer func() { + noPara = noParams + }() + if value = r.FormValue(key); value != "" { + noParams = false + var val = int64(0) + val, err = strconv.ParseInt(value, 10, 64) + if err != nil { + err = unmatchedKey(key) + return + } + params[key] = val } return } @@ -2052,10 +2242,12 @@ func (m *Server) getMetaPartition(w http.ResponseWriter, r *http.Request) { } for i := 0; i < len(replicas); i++ { replicas[i] = &proto.MetaReplicaInfo{ - Addr: mp.Replicas[i].Addr, - ReportTime: mp.Replicas[i].ReportTime, - Status: mp.Replicas[i].Status, - IsLeader: mp.Replicas[i].IsLeader, + Addr: mp.Replicas[i].Addr, + ReportTime: mp.Replicas[i].ReportTime, + Status: mp.Replicas[i].Status, + IsLeader: mp.Replicas[i].IsLeader, + DentryCount: mp.Replicas[i].DentryCount, + InodeCount: mp.Replicas[i].InodeCount, } } var mpInfo = &proto.MetaPartitionInfo{ @@ -2155,6 +2347,14 @@ func parseAndExtractName(r *http.Request) (name string, err error) { return extractName(r) } +func extractZoneName(r *http.Request) (name string, err error) { + if name = r.FormValue(zoneNameKey); name == "" { + err = keyNotFound(zoneNameKey) + return + } + return +} + func extractName(r *http.Request) (name string, err error) { if name = r.FormValue(nameKey); name == "" { err = keyNotFound(nameKey) diff --git a/master/api_service_test.go b/master/api_service_test.go index 8ed93c1900..0c47729113 100644 --- a/master/api_service_test.go +++ b/master/api_service_test.go @@ -43,6 +43,10 @@ const ( mds3Addr = "127.0.0.1:9103" mds4Addr = "127.0.0.1:9104" mds5Addr = "127.0.0.1:9105" + mds6Addr = "127.0.0.1:9106" + mds7Addr = "127.0.0.1:9107" + mds8Addr = "127.0.0.1:9108" + mds9Addr = "127.0.0.1:9109" mms1Addr = "127.0.0.1:8101" mms2Addr = "127.0.0.1:8102" @@ -50,9 +54,13 @@ const ( mms4Addr = "127.0.0.1:8104" mms5Addr = "127.0.0.1:8105" mms6Addr = "127.0.0.1:8106" + mms7Addr = "127.0.0.1:8107" + mms8Addr = "127.0.0.1:8108" + mms9Addr = "127.0.0.1:8109" commonVolName = "commonVol" testZone1 = "zone1" testZone2 = "zone2" + testZone3 = "zone3" testUserID = "testUser" ak = "0123456789123456" @@ -85,6 +93,7 @@ func createDefaultMasterServerForTest() *Server { if err != nil { panic(err) } + testServer.config.nodeSetCapacity = defaultNodeSetCapacity //add data node 
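The mock topology assembled below is what exercises the new multi-zone paths: fake data and meta servers are registered per zone, then two heartbeat rounds let the master mark them active. A hedged sketch of that bootstrap pattern, assuming only the mocktest API already used in this file (the map literal and loop are illustrative, not part of the patch); the actual registration calls follow:

	zoneHosts := map[string][]string{
		testZone1: {mds1Addr, mds2Addr},
		testZone2: {mds3Addr, mds4Addr, mds5Addr},
	}
	for zone, addrs := range zoneHosts {
		for _, addr := range addrs {
			mds := mocktest.NewMockDataServer(addr, zone) // fake data node in that zone
			mds.Start()                                   // starts answering master RPCs
		}
	}
	server.cluster.checkDataNodeHeartbeat() // one heartbeat round registers the nodes
	time.Sleep(5 * time.Second)             // give the responses time to land
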
addDataServer(mds1Addr, testZone1) addDataServer(mds2Addr, testZone1) @@ -98,6 +107,9 @@ func createDefaultMasterServerForTest() *Server { addMetaServer(mms4Addr, testZone2) addMetaServer(mms5Addr, testZone2) time.Sleep(5 * time.Second) + testServer.cluster.cfg = newClusterConfig() + testServer.cluster.cfg.DataPartitionsRecoverPoolSize = maxDataPartitionsRecoverPoolSize + testServer.cluster.cfg.MetaPartitionsRecoverPoolSize = maxMetaPartitionsRecoverPoolSize testServer.cluster.checkDataNodeHeartbeat() testServer.cluster.checkMetaNodeHeartbeat() time.Sleep(5 * time.Second) @@ -180,14 +192,28 @@ func createMasterServer(cfgJSON string) (server *Server, err error) { return server, nil } -func addDataServer(addr, zoneName string) { - mds := mocktest.NewMockDataServer(addr, zoneName) +func addDataServer(addr, zoneName string) (mds *mocktest.MockDataServer) { + mds = mocktest.NewMockDataServer(addr, zoneName) mds.Start() + return mds } -func addMetaServer(addr, zoneName string) { - mms := mocktest.NewMockMetaServer(addr, zoneName) +func stopDataServer(mds *mocktest.MockDataServer) { + dn, _ := server.cluster.dataNode(mds.TcpAddr) + server.cluster.delDataNodeFromCache(dn) + mds.Stop() +} + +func addMetaServer(addr, zoneName string) (mms *mocktest.MockMetaServer) { + mms = mocktest.NewMockMetaServer(addr, zoneName) mms.Start() + return mms +} + +func stopMetaServer(mms *mocktest.MockMetaServer) { + mn, _ := server.cluster.metaNode(mms.TcpAddr) + server.cluster.deleteMetaNodeFromCache(mn) + mms.Stop() } func TestSetMetaNodeThreshold(t *testing.T) { @@ -294,7 +320,7 @@ func decommissionDisk(addr, path string, t *testing.T) { func TestMarkDeleteVol(t *testing.T) { name := "delVol" - createVol(name, t) + createVol(name, testZone2, t) reqURL := fmt.Sprintf("%v%v?name=%v&authKey=%v", hostAddr, proto.AdminDeleteVol, name, buildAuthKey("cfs")) process(reqURL, t) userInfo, err := server.user.getUserInfo("cfs") @@ -472,7 +498,7 @@ func TestGetMetaNode(t *testing.T) { func TestAddDataReplica(t *testing.T) { partition := commonVol.dataPartitions.partitions[0] - dsAddr := "127.0.0.1:9106" + dsAddr := mds6Addr addDataServer(dsAddr, "zone2") reqURL := fmt.Sprintf("%v%v?id=%v&addr=%v", hostAddr, proto.AdminAddDataReplica, partition.PartitionID, dsAddr) process(reqURL, t) @@ -499,7 +525,7 @@ func TestAddDataReplica(t *testing.T) { func TestRemoveDataReplica(t *testing.T) { partition := commonVol.dataPartitions.partitions[0] partition.isRecover = false - dsAddr := "127.0.0.1:9106" + dsAddr := mds6Addr reqURL := fmt.Sprintf("%v%v?id=%v&addr=%v", hostAddr, proto.AdminDeleteDataReplica, partition.PartitionID, dsAddr) process(reqURL, t) partition.RLock() @@ -519,7 +545,7 @@ func TestAddMetaReplica(t *testing.T) { t.Error("no meta partition") return } - msAddr := "127.0.0.1:8009" + msAddr := mms9Addr addMetaServer(msAddr, testZone2) server.cluster.checkMetaNodeHeartbeat() time.Sleep(2 * time.Second) @@ -542,7 +568,7 @@ func TestRemoveMetaReplica(t *testing.T) { return } partition.IsRecover = false - msAddr := "127.0.0.1:8009" + msAddr := mms9Addr reqURL := fmt.Sprintf("%v%v?id=%v&addr=%v", hostAddr, proto.AdminDeleteMetaReplica, partition.PartitionID, msAddr) process(reqURL, t) partition.RLock() diff --git a/master/cluster.go b/master/cluster.go index 98dc8801a0..daee6619f8 100644 --- a/master/cluster.go +++ b/master/cluster.go @@ -16,6 +16,8 @@ package master import ( "fmt" + "sort" + "strings" "sync" "sync/atomic" "time" @@ -49,13 +51,36 @@ type Cluster struct { volStatInfo sync.Map BadDataPartitionIds *sync.Map 
BadMetaPartitionIds *sync.Map
+	MigratedMetaPartitionIds *sync.Map
+	MigratedDataPartitionIds *sync.Map
 	DisableAutoAllocate bool
 	fsm                 *MetadataFsm
 	partition           raftstore.Partition
 	MasterSecretKey     []byte
 	lastMasterZoneForDataNode string
 	lastMasterZoneForMetaNode string
+	lastPermutationsForZone   uint8
+	dpRepairChan chan *RepairTask
+	mpRepairChan chan *RepairTask
 }
 
+type (
+	RepairType uint8
+)
+
+const (
+	BalanceMetaZone RepairType = iota
+	BalanceDataZone
+	RepairMetaDecommission
+	RepairDataDecommission
+	RepairAddReplica
+)
+
+type RepairTask struct {
+	RType       RepairType
+	Pid         uint64
+	OfflineAddr string
+}
+type ChooseDataHostFunc func(c *Cluster, offlineAddr string, dp *DataPartition, excludeNodeSets []uint64, zoneName string, destZoneName string) (oldAddr, newAddr string, err error)
 
 func newCluster(name string, leaderInfo *LeaderInfo, fsm *MetadataFsm, partition raftstore.Partition, cfg *clusterConfig) (c *Cluster) {
 	c = new(Cluster)
@@ -66,12 +91,16 @@
 	c.t = newTopology()
 	c.BadDataPartitionIds = new(sync.Map)
 	c.BadMetaPartitionIds = new(sync.Map)
+	c.MigratedDataPartitionIds = new(sync.Map)
+	c.MigratedMetaPartitionIds = new(sync.Map)
 	c.dataNodeStatInfo = new(nodeStatInfo)
 	c.metaNodeStatInfo = new(nodeStatInfo)
 	c.zoneStatInfos = make(map[string]*proto.ZoneStat)
 	c.fsm = fsm
 	c.partition = partition
 	c.idAlloc = newIDAllocator(c.fsm.store, c.partition)
+	c.initDpRepairChan()
+	c.initMpRepairChan()
 	return
 }
 
@@ -88,6 +117,9 @@ func (c *Cluster) scheduleTask() {
 	c.scheduleToCheckMetaPartitionRecoveryProgress()
 	c.scheduleToLoadMetaPartitions()
 	c.scheduleToReduceReplicaNum()
+	c.scheduleToRepairMultiZoneMetaPartitions()
+	c.scheduleToRepairMultiZoneDataPartitions()
+
 }
 
 func (c *Cluster) masterAddr() (addr string) {
@@ -319,6 +351,196 @@ func (c *Cluster) checkVolReduceReplicaNum() {
 		vol.checkReplicaNum(c)
 	}
 }
+func (c *Cluster) repairDataPartition(wg *sync.WaitGroup) {
+	for i := 0; i < cap(c.dpRepairChan); i++ {
+		select {
+		case task := <-c.dpRepairChan:
+			wg.Add(1)
+			go func(c *Cluster, task *RepairTask) {
+				var err error
+				defer func() {
+					wg.Done()
+					if err != nil {
+						log.LogErrorf("ClusterID[%v], Action[repairDataPartition], err[%v]", c.Name, err)
+					}
+				}()
+				var dp *DataPartition
+				if dp, err = c.getDataPartitionByID(task.Pid); err != nil {
+					return
+				}
+				switch task.RType {
+				case BalanceDataZone:
+					if err = c.decommissionDataPartition("", dp, getTargetAddressForBalanceDataPartitionZone, balanceDataPartitionZoneErr, "", false); err != nil {
+						return
+					}
+					Warn(c.Name, fmt.Sprintf("action[repairDataPartition] clusterID[%v] vol[%v] data partition[%v] "+
+						"Repair success, type[%v]", c.Name, dp.VolName, dp.PartitionID, task.RType))
+				default:
+					err = fmt.Errorf("action[repairDataPartition] unknown repair task type")
+					return
+				}
+			}(c, task)
+		default:
+			time.Sleep(time.Second * 2)
+		}
+	}
+}
+
+func (c *Cluster) repairMetaPartition(wg *sync.WaitGroup) {
+	for i := 0; i < cap(c.mpRepairChan); i++ {
+		select {
+		case task := <-c.mpRepairChan:
+			wg.Add(1)
+			go func(c *Cluster, task *RepairTask) {
+				var err error
+				defer func() {
+					wg.Done()
+					if err != nil {
+						log.LogErrorf("ClusterID[%v], Action[repairMetaPartition], err[%v]", c.Name, err)
+					}
+				}()
+				var mp *MetaPartition
+				if mp, err = c.getMetaPartitionByID(task.Pid); err != nil {
+					return
+				}
+				switch task.RType {
+				case BalanceMetaZone:
+					if err = c.decommissionMetaPartition("", mp, getTargetAddressForRepairMetaZone, false); err != nil {
+						return
+					}
+					Warn(c.Name, fmt.Sprintf("action[repairMetaPartition] clusterID[%v] vol[%v] meta partition[%v] "+
+						"Repair success, task type[%v]", c.Name, mp.volName, mp.PartitionID, task.RType))
+				default:
+					err = fmt.Errorf("action[repairMetaPartition] unknown repair task type")
+					return
+				}
+			}(c, task)
+		default:
+			time.Sleep(time.Second * 2)
+		}
+	}
+}
+func (c *Cluster) dataPartitionInRecovering() (num int) {
+	c.BadDataPartitionIds.Range(func(key, value interface{}) bool {
+		badDataPartitionIds := value.([]uint64)
+		num = num + len(badDataPartitionIds)
+		return true
+	})
+
+	return
+}
+
+func (c *Cluster) metaPartitionInRecovering() (num int) {
+	c.BadMetaPartitionIds.Range(func(key, value interface{}) bool {
+		badMetaPartitionIds := value.([]uint64)
+		num = num + len(badMetaPartitionIds)
+		return true
+	})
+	return
+}
+func (c *Cluster) scheduleToRepairMultiZoneMetaPartitions() {
+	//consumer
+	go func() {
+		for {
+			var wg sync.WaitGroup
+			c.repairMetaPartition(&wg)
+			wg.Wait()
+			time.Sleep(time.Second * defaultIntervalToCheckDataPartition)
+		}
+	}()
+	//producer
+	go func() {
+		for {
+			if c.partition != nil && c.partition.IsRaftLeader() && !c.t.isSingleZone() {
+				c.checkVolRepairMetaPartitions()
+			}
+			time.Sleep(time.Second * defaultIntervalToCheckDataPartition)
+		}
+	}()
+}
+
+func (c *Cluster) checkVolRepairMetaPartitions() {
+	defer func() {
+		if r := recover(); r != nil {
+			log.LogWarnf("checkVolRepairMetaPartitions occurred panic,err[%v]", r)
+			WarnBySpecialKey(fmt.Sprintf("%v_%v_scheduling_job_panic", c.Name, ModuleName),
+				"checkVolRepairMetaPartitions occurred panic")
+		}
+	}()
+	var mpInRecover uint64
+	if c.DisableAutoAllocate || c.cfg.MetaPartitionsRecoverPoolSize == defaultRecoverPoolSize {
+		return
+	}
+	mpInRecover = uint64(c.metaPartitionInRecovering())
+	if int32(mpInRecover) > c.cfg.MetaPartitionsRecoverPoolSize {
+		log.LogInfof("action[checkVolRepairMetaPartitions] clusterID[%v] Recover pool is full, recover partition[%v], pool size[%v]", c.Name, mpInRecover, c.cfg.MetaPartitionsRecoverPoolSize)
+		return
+	}
+	vols := c.allVols()
+	for _, vol := range vols {
+		if !vol.autoRepair {
+			continue
+		}
+		if isValid, _ := c.isValidZone(vol.zoneName); !isValid {
+			log.LogWarnf("checkVolRepairMetaPartitions, vol[%v], zoneName[%v] not valid, skip repair", vol.Name, vol.zoneName)
+			continue
+		}
+		vol.checkRepairMetaPartitions(c)
+	}
+}
+
+func (c *Cluster) scheduleToRepairMultiZoneDataPartitions() {
+	//consumer
+	go func() {
+		for {
+			var wg sync.WaitGroup
+			c.repairDataPartition(&wg)
+			wg.Wait()
+			time.Sleep(time.Second * defaultIntervalToCheckDataPartition)
+		}
+	}()
+	//producer
+	go func() {
+		for {
+			if c.partition != nil && c.partition.IsRaftLeader() && !c.t.isSingleZone() {
+				c.checkVolRepairDataPartitions()
+			}
+			time.Sleep(time.Second * defaultIntervalToCheckDataPartition)
+		}
+	}()
+}
+
+func (c *Cluster) checkVolRepairDataPartitions() {
+	defer func() {
+		if r := recover(); r != nil {
+			log.LogWarnf("checkVolRepairDataPartitions occurred panic,err[%v]", r)
+			WarnBySpecialKey(fmt.Sprintf("%v_%v_scheduling_job_panic", c.Name, ModuleName),
+				"checkVolRepairDataPartitions occurred panic")
+		}
+	}()
+	var dpInRecover int
+	if c.DisableAutoAllocate || c.cfg.DataPartitionsRecoverPoolSize == defaultRecoverPoolSize {
+		return
+	}
+	dpInRecover = c.dataPartitionInRecovering()
+	if int32(dpInRecover) >= c.cfg.DataPartitionsRecoverPoolSize {
+		log.LogInfof("action[checkVolRepairDataPartitions] clusterID[%v] Recover pool is full, recover partition[%v], pool size[%v]", c.Name, dpInRecover,
c.cfg.DataPartitionsRecoverPoolSize) + return + } + + vols := c.allVols() + for _, vol := range vols { + if !vol.autoRepair { + continue + } + if isValid, _ := c.isValidZone(vol.zoneName); !isValid { + log.LogWarnf("checkVolRepairDataPartitions, vol[%v], zoneName[%v] not valid, skip repair", vol.Name, vol.zoneName) + continue + } + vol.checkRepairDataPartitions(c) + } +} + func (c *Cluster) getInvalidIDNodes() (nodes []*InvalidNodeView) { metaNodes := c.getNotConsistentIDMetaNodes() @@ -638,17 +860,11 @@ func (c *Cluster) markDeleteVol(name, authKey string) (err error) { } func (c *Cluster) batchCreateDataPartition(vol *Vol, reqCount int) (err error) { - var zoneNum int for i := 0; i < reqCount; i++ { if c.DisableAutoAllocate { return } - zoneNum = c.decideZoneNum(vol.crossZone) - //most of partitions are replicated across 3 zones,but a few partitions are replicated across 2 zones - if vol.crossZone && i%5 == 0 { - zoneNum = 2 - } - if _, err = c.createDataPartition(vol.Name, zoneNum); err != nil { + if _, err = c.createDataPartition(vol.Name); err != nil { log.LogErrorf("action[batchCreateDataPartition] after create [%v] data partition,occurred error,err[%v]", i, err) break } @@ -662,7 +878,7 @@ func (c *Cluster) batchCreateDataPartition(vol *Vol, reqCount int) (err error) { // 3. Communicate with the data node to synchronously create a data partition. // - If succeeded, replicate the data through raft and persist it to RocksDB. // - Otherwise, throw errors -func (c *Cluster) createDataPartition(volName string, zoneNum int) (dp *DataPartition, err error) { +func (c *Cluster) createDataPartition(volName string) (dp *DataPartition, err error) { var ( vol *Vol partitionID uint64 @@ -677,7 +893,7 @@ func (c *Cluster) createDataPartition(volName string, zoneNum int) (dp *DataPart vol.createDpMutex.Lock() defer vol.createDpMutex.Unlock() errChannel := make(chan error, vol.dpReplicaNum) - if targetHosts, targetPeers, err = c.chooseTargetDataNodes("", nil, nil, int(vol.dpReplicaNum), zoneNum, vol.zoneName); err != nil { + if targetHosts, targetPeers, err = c.chooseTargetDataNodes("", nil, nil, int(vol.dpReplicaNum), vol.zoneName); err != nil { goto errHandler } if partitionID, err = c.idAlloc.allocateDataPartitionID(); err != nil { @@ -769,117 +985,236 @@ func (c *Cluster) syncCreateMetaPartitionToMetaNode(host string, mp *MetaPartiti return } -//decideZoneNum -//if vol is not cross zone, return 1 -//if vol enable cross zone and the zone number of cluster less than defaultReplicaNum return 2 -//otherwise, return defaultReplicaNum -func (c *Cluster) decideZoneNum(crossZone bool) (zoneNum int) { - if !crossZone { - return 1 +func (c *Cluster) isValidZone(zoneName string) (isValid bool, err error) { + isValid = true + if zoneName == "" { + isValid = false + return } - zoneLen := c.t.zoneLen() - if zoneLen < defaultReplicaNum { - zoneNum = 2 - } else { - zoneNum = defaultReplicaNum + zoneList := strings.Split(zoneName, ",") + for _, name := range zoneList { + if _, err = c.t.getZone(name); err != nil { + isValid = false + return + } } - return zoneNum + return } -func (c *Cluster) chooseTargetDataNodes(excludeZone string, excludeNodeSets []uint64, excludeHosts []string, replicaNum int, zoneNum int, specifiedZone string) (hosts []string, peers []proto.Peer, err error) { - var ( - masterZone *Zone - zones []*Zone - ) - excludeZones := make([]string, 0) - if excludeZone != "" { - excludeZones = append(excludeZones, excludeZone) +//valid zone name +//if zone name duplicate, return error +//if vol enable 
cross zone and the zone number of cluster less than defaultReplicaNum return error +func (c *Cluster) validZone(zoneName string, replicaNum int) (err error) { + var crossZone bool + if zoneName == "" { + err = fmt.Errorf("zone name empty") + return } - if replicaNum <= zoneNum { - zoneNum = replicaNum + + zoneList := strings.Split(zoneName, ",") + sort.Strings(zoneList) + if len(zoneList) > 1 { + crossZone = true } - // when creating vol,user specified a zone,we reset zoneNum to 1,to be created partition with specified zone, - //if specified zone is not writable,we choose a zone randomly - if specifiedZone != "" { - zoneNum = 1 - zone, err := c.t.getZone(specifiedZone) - if err != nil { - Warn(c.Name, fmt.Sprintf("cluster[%v],specified zone[%v]is not writable", c.Name, specifiedZone)) - } else { - zones = make([]*Zone, 0) - zones = append(zones, zone) - } + if crossZone && c.t.zoneLen() <= 1 { + return fmt.Errorf("cluster has one zone,can't cross zone") } - if zones == nil || specifiedZone == "" { - if zones, err = c.t.allocZonesForDataNode(zoneNum, replicaNum, excludeZones); err != nil { + for _, name := range zoneList { + if _, err = c.t.getZone(name); err != nil { return } } - //if vol enable cross zone,available zone less than 2,can't create partition - if zoneNum >= 2 && len(zones) < 2 { - return nil, nil, fmt.Errorf("no enough zones[%v] to be selected,crossNum[%v]", len(zones), zoneNum) + if len(zoneList) == 1 { + return + } + if len(zoneList) > replicaNum { + err = fmt.Errorf("can not specify zone number[%v] more than replica number[%v]", len(zoneList), replicaNum) } - if len(zones) == 1 { - if hosts, peers, err = zones[0].getAvailDataNodeHosts(excludeNodeSets, excludeHosts, replicaNum); err != nil { - log.LogErrorf("action[chooseTargetDataNodes],err[%v]", err) + if len(zoneList) > defaultReplicaNum { + err = fmt.Errorf("can not specify zone number[%v] more than %v", len(zoneList), defaultReplicaNum) + } + //if length of zoneList more than 1, there should not be duplicate zone names + for i := 0; i < len(zoneList)-1; i++ { + if zoneList[i] == zoneList[i+1] { + err = fmt.Errorf("duplicate zone:[%v]", zoneList[i]) return } - goto result } + return +} + +func (c *Cluster) chooseTargetDataNodes(excludeZone string, excludeNodeSets []uint64, excludeHosts []string, replicaNum int, zoneName string) (hosts []string, peers []proto.Peer, err error) { + + var ( + zones []*Zone + ) + allocateZoneMap := make(map[*Zone][]string, 0) + hasAllocateNum := 0 + excludeZones := make([]string, 0) hosts = make([]string, 0) peers = make([]proto.Peer, 0) if excludeHosts == nil { excludeHosts = make([]string, 0) } - //replicaNum is equal with the number of allocated zones - if replicaNum == len(zones) { + + if excludeZone != "" { + excludeZones = append(excludeZones, excludeZone) + } + zoneList := strings.Split(zoneName, ",") + if zones, err = c.t.allocZonesForDataNode(zoneName, replicaNum, excludeZones); err != nil { + return + } + + if len(zones) == 1 && len(zoneList) == 1 { + if hosts, peers, err = zones[0].getAvailDataNodeHosts(excludeNodeSets, excludeHosts, replicaNum); err != nil { + log.LogErrorf("action[chooseTargetDataNodes],err[%v]", err) + return + } + goto result + } + // Different from the meta partition whose replicas fully fills the 3 zones, + // each data partition just fills 2 zones to decrease data transfer across zones. + // Loop through the 3-zones permutation according to the lastPermutationsForZone + // to choose 2 zones for each partition. 
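A hedged sketch of the rotation the comment above describes, pulled out as a standalone helper for readability (illustrative only; the patch inlines the same index arithmetic in the two switch blocks below, and the e.g. line that follows enumerates this cycle for idx = 0 through 5):

	// permuteZones returns the 3-slot zone layout for cycle position idx (0..5):
	// the first half doubles zones[idx]; the second half mirrors the pairing.
	func permuteZones(zones []*Zone, idx uint8) []*Zone {
		if idx < 3 {
			return []*Zone{zones[idx], zones[idx], zones[(idx+1)%3]}
		}
		i := idx - 3
		return []*Zone{zones[(i+1)%3], zones[(i+1)%3], zones[i]}
	}
	// callers then advance the cursor: c.lastPermutationsForZone = (idx + 1) % 6
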
+ // e.g.[zone0, zone0, zone1] -> [zone1, zone1, zone2] -> [zone2, zone2, zone0] + // -> [zone1, zone1, zone0] -> [zone2, zone2, zone1] -> [zone0, zone0, zone2] + // If [zone0, zone1] is chosen for a partition with 3 replicas, 2 replicas will be allocated to zone0, + // the rest one will be allocated to zone1. + if len(zones) == 2 { + switch c.lastPermutationsForZone % 2 { + case 0: + zones = append(make([]*Zone, 0), zones[0], zones[1]) + c.lastPermutationsForZone = (c.lastPermutationsForZone + 1) % 6 + default: + zones = append(make([]*Zone, 0), zones[1], zones[0]) + c.lastPermutationsForZone = (c.lastPermutationsForZone + 1) % 6 + } + } + if len(zones) == 3 { + switch c.lastPermutationsForZone < 3 { + case true: + index := c.lastPermutationsForZone + zones = append(make([]*Zone, 0), zones[index], zones[index], zones[(index+1)%3]) + c.lastPermutationsForZone = (c.lastPermutationsForZone + 1) % 6 + default: + index := c.lastPermutationsForZone - 3 + zones = append(make([]*Zone, 0), zones[(index+1)%3], zones[(index+1)%3], zones[index]) + c.lastPermutationsForZone = (c.lastPermutationsForZone + 1) % 6 + } + } + for hasAllocateNum < replicaNum { + localExcludeHosts := excludeHosts for _, zone := range zones { - selectedHosts, selectedPeers, e := zone.getAvailDataNodeHosts(excludeNodeSets, excludeHosts, 1) + localExcludeHosts = append(localExcludeHosts, allocateZoneMap[zone]...) + selectedHosts, selectedPeers, e := zone.getAvailDataNodeHosts(excludeNodeSets, localExcludeHosts, 1) if e != nil { return nil, nil, errors.NewError(e) } hosts = append(hosts, selectedHosts...) peers = append(peers, selectedPeers...) + allocateZoneMap[zone] = append(allocateZoneMap[zone], selectedHosts...) + hasAllocateNum = hasAllocateNum + 1 + if hasAllocateNum == replicaNum { + break + } } - goto result } - - // replicaNum larger than the number of allocated zones - for _, zone := range zones { - if zone.name != c.lastMasterZoneForDataNode { - masterZone = zone - c.lastMasterZoneForDataNode = zone.name - break + goto result +result: + log.LogInfof("action[chooseTargetDataNodes] replicaNum[%v],zoneName[%v],selectedZones[%v],hosts[%v]", replicaNum, zoneName, len(zones), hosts) + if len(hosts) != replicaNum { + log.LogErrorf("action[chooseTargetDataNodes] replicaNum[%v],zoneName[%v],selectedZones[%v],hosts[%v]", replicaNum, zoneName, len(zones), hosts) + return nil, nil, errors.Trace(proto.ErrNoDataNodeToCreateDataPartition, "hosts len[%v],replicaNum[%v],zoneName[%v],selectedZones[%v]", + len(hosts), replicaNum, zoneName, len(zones)) + } + return +} +func (c *Cluster) chooseTargetDataNodesForDecommission(excludeZone string, dp *DataPartition, excludeHosts []string, replicaNum int, zoneName string) (hosts []string, peers []proto.Peer, err error) { + var zones []*Zone + var targetZone *Zone + zones = make([]*Zone, 0) + zoneList := strings.Split(zoneName, ",") + for _, z := range zoneList { + var zone *Zone + if zone, err = c.t.getZone(z); err != nil { + return } + zones = append(zones, zone) } - if masterZone == nil { - masterZone = zones[0] + //if not cross zone, choose a zone from all zones + if len(zoneList) <= 1 { + zones = c.t.getAllZones() } - for _, zone := range zones { - if zone.name == masterZone.name { - rNum := replicaNum - len(zones) + 1 - selectedHosts, selectedPeers, e := zone.getAvailDataNodeHosts(excludeNodeSets, excludeHosts, rNum) - if e != nil { - return nil, nil, errors.NewError(e) + demandWriteNodes := 1 + candidateZones := make([]*Zone, 0) + for _, z := range zones { + if z.status == 
unavailableZone { + continue + } + if excludeZone == z.name { + continue + } + if z.canWriteForDataNode(uint8(demandWriteNodes)) { + candidateZones = append(candidateZones, z) + } + } + //must have a candidate zone + if len(candidateZones) < 1 { + log.LogError(fmt.Sprintf("action[allocZonesForDataNode],there are no candidateZones, demandWriteNodes[%v], err:%v", + demandWriteNodes, proto.ErrNoZoneToCreateDataPartition)) + return nil, nil, proto.ErrNoZoneToCreateDataPartition + } + //choose target zone for single zone partition + if len(zoneList) == 1 { + for index, zone := range candidateZones { + if c.lastMasterZoneForDataNode == "" { + targetZone = zone + c.lastMasterZoneForDataNode = targetZone.name + break } - hosts = append(hosts, selectedHosts...) - peers = append(peers, selectedPeers...) - } else { - selectedHosts, selectedPeers, e := zone.getAvailDataNodeHosts(excludeNodeSets, excludeHosts, 1) - if e != nil { - return nil, nil, errors.NewError(e) + if zone.name == c.lastMasterZoneForDataNode { + if index == len(candidateZones)-1 { + targetZone = candidateZones[0] + } else { + targetZone = candidateZones[index+1] + } + c.lastMasterZoneForDataNode = targetZone.name + break } - hosts = append(hosts, selectedHosts...) - peers = append(peers, selectedPeers...) + } + if targetZone == nil { + targetZone = candidateZones[0] + c.lastMasterZoneForDataNode = targetZone.name } } -result: - log.LogInfof("action[chooseTargetDataNodes] replicaNum[%v],zoneNum[%v],selectedZones[%v],hosts[%v]", replicaNum, zoneNum, len(zones), hosts) - if len(hosts) != replicaNum { - log.LogErrorf("action[chooseTargetDataNodes] replicaNum[%v],zoneNum[%v],selectedZones[%v],hosts[%v]", replicaNum, zoneNum, len(zones), hosts) - return nil, nil, errors.Trace(proto.ErrNoDataNodeToCreateDataPartition, "hosts len[%v],replicaNum[%v],zoneNum[%v],selectedZones[%v]", - len(hosts), replicaNum, zoneNum, len(zones)) + //choose target zone for cross zone partition + if len(zoneList) > 1 { + var curZonesMap map[string]uint8 + if curZonesMap, err = dp.getDataZoneMap(c); err != nil { + return + } + //avoid change from 2 zones to 1 zone after decommission + if len(curZonesMap) == 2 && curZonesMap[excludeZone] == 1 { + for k := range curZonesMap { + if k == excludeZone { + continue + } + for _, z := range candidateZones { + if z.name == k { + continue + } + targetZone = z + } + } + } else { + targetZone = candidateZones[0] + } } + if targetZone == nil { + err = fmt.Errorf("no candidate zones available") + return + } + hosts, peers, err = targetZone.getAvailDataNodeHosts(nil, excludeHosts, 1) return } @@ -987,8 +1322,9 @@ func (c *Cluster) getAllMetaPartitionsByMetaNode(addr string) (partitions []*Met return } -func (c *Cluster) decommissionDataNode(dataNode *DataNode) (err error) { - msg := fmt.Sprintf("action[decommissionDataNode], Node[%v] OffLine", dataNode.Addr) + +func (c *Cluster) decommissionDataNode(dataNode *DataNode, destZoneName string, strictFlag bool) (err error) { + msg := fmt.Sprintf("action[decommissionDataNode], Node[%v],strictMode[%v] OffLine", dataNode.Addr, strictFlag) log.LogWarn(msg) var wg sync.WaitGroup dataNode.ToBeOffline = true @@ -996,14 +1332,16 @@ func (c *Cluster) decommissionDataNode(dataNode *DataNode) (err error) { partitions := c.getAllDataPartitionByDataNode(dataNode.Addr) errChannel := make(chan error, len(partitions)) defer func() { - dataNode.ToBeOffline = false + if err != nil { + dataNode.ToBeOffline = false + } close(errChannel) }() for _, dp := range partitions { wg.Add(1) go func(dp 
*DataPartition) { defer wg.Done() - if err1 := c.decommissionDataPartition(dataNode.Addr, dp, dataNodeOfflineErr); err1 != nil { + if err1 := c.decommissionDataPartition(dataNode.Addr, dp, getTargetAddressForDataPartitionDecommission, dataNodeOfflineErr, destZoneName, strictFlag); err1 != nil { errChannel <- err1 } }(dp) @@ -1033,7 +1371,9 @@ func (c *Cluster) delDataNodeFromCache(dataNode *DataNode) { go dataNode.clean() } -// Decommission a data partition. +// Decommission a data partition.In strict mode, only if the size of the replica is equal, +// or the number of files is equal, the recovery is considered complete. when it is triggered by migrated dataNode, +// the strict mode is true,otherwise is false. // 1. Check if we can decommission a data partition. In the following cases, we are not allowed to do so: // - (a) a replica is not in the latest host list; // - (b) there is already a replica been taken offline; @@ -1043,96 +1383,170 @@ func (c *Cluster) delDataNodeFromCache(dataNode *DataNode) { // 4. synchronized create a new data partition // 5. Set the data partition as readOnly. // 6. persistent the new host list -func (c *Cluster) decommissionDataPartition(offlineAddr string, dp *DataPartition, errMsg string) (err error) { +func (c *Cluster) decommissionDataPartition(offlineAddr string, dp *DataPartition, chooseDataHostFunc ChooseDataHostFunc, errMsg, destZoneName string, strictMode bool) (err error) { var ( - targetHosts []string - newAddr string - msg string - dataNode *DataNode - zone *Zone - replica *DataReplica - ns *nodeSet + oldAddr string + addAddr string + dpReplica *DataReplica excludeNodeSets []uint64 - zones []string - excludeZone string + msg string + vol *Vol ) - dp.RLock() - if ok := dp.hasHost(offlineAddr); !ok { - dp.RUnlock() - return - } - replica, _ = dp.getReplica(offlineAddr) - dp.RUnlock() - if err = c.validateDecommissionDataPartition(dp, offlineAddr); err != nil { - goto errHandler - } - - if dataNode, err = c.dataNode(offlineAddr); err != nil { - goto errHandler - } - - if dataNode.ZoneName == "" { - err = fmt.Errorf("dataNode[%v] zone is nil", dataNode.Addr) + dp.offlineMutex.Lock() + defer dp.offlineMutex.Unlock() + excludeNodeSets = make([]uint64, 0) + if vol, err = c.getVol(dp.VolName); err != nil { goto errHandler } - if zone, err = c.t.getZone(dataNode.ZoneName); err != nil { + if oldAddr, addAddr, err = chooseDataHostFunc(c, offlineAddr, dp, excludeNodeSets, vol.zoneName, destZoneName); err != nil { goto errHandler } - if ns, err = zone.getNodeSet(dataNode.NodeSetID); err != nil { + if dpReplica, err = dp.getReplica(oldAddr); err != nil { goto errHandler } - if targetHosts, _, err = ns.getAvailDataNodeHosts(dp.Hosts, 1); err != nil { - // select data nodes from the other node set in same zone - excludeNodeSets = append(excludeNodeSets, ns.ID) - if targetHosts, _, err = zone.getAvailDataNodeHosts(excludeNodeSets, dp.Hosts, 1); err != nil { - // select data nodes from the other zone - zones = dp.getLiveZones(offlineAddr) - if len(zones) == 0 { - excludeZone = zone.name - } else { - excludeZone = zones[0] - } - if targetHosts, _, err = c.chooseTargetDataNodes(excludeZone, excludeNodeSets, dp.Hosts, 1, 1, ""); err != nil { - goto errHandler - } - } - } - if err = c.removeDataReplica(dp, offlineAddr, false); err != nil { - goto errHandler + if err = c.removeDataReplica(dp, oldAddr, false, strictMode); err != nil { + return } - newAddr = targetHosts[0] - if err = c.addDataReplica(dp, newAddr); err != nil { - goto errHandler + if err = 
c.addDataReplica(dp, addAddr); err != nil { + return } + dp.Lock() dp.Status = proto.ReadOnly dp.isRecover = true - c.putBadDataPartitionIDs(replica, offlineAddr, dp.PartitionID) - dp.RLock() c.syncUpdateDataPartition(dp) - dp.RUnlock() - log.LogWarnf("clusterID[%v] partitionID:%v on Node:%v offline success,newHost[%v],PersistenceHosts:[%v]", - c.Name, dp.PartitionID, offlineAddr, newAddr, dp.Hosts) + dp.Unlock() + if strictMode { + c.putMigratedDataPartitionIDs(dpReplica, oldAddr, dp.PartitionID) + } else { + c.putBadDataPartitionIDs(dpReplica, oldAddr, dp.PartitionID) + } return errHandler: - msg = fmt.Sprintf(errMsg+" clusterID[%v] partitionID:%v on Node:%v "+ + msg = errMsg + fmt.Sprintf("clusterID[%v] partitionID:%v on Node:%v "+ "Then Fix It on newHost:%v Err:%v , PersistenceHosts:%v ", - c.Name, dp.PartitionID, offlineAddr, newAddr, err, dp.Hosts) + c.Name, dp.PartitionID, oldAddr, addAddr, err, dp.Hosts) if err != nil { Warn(c.Name, msg) err = fmt.Errorf("vol[%v],partition[%v],err[%v]", dp.VolName, dp.PartitionID, err) } return } +func (partition *DataPartition) RepairZone(vol *Vol, c *Cluster) (err error) { + var ( + zoneList []string + isNeedBalance bool + ) + partition.RLock() + defer partition.RUnlock() + var isValidZone bool + if isValidZone, err = c.isValidZone(vol.zoneName); err != nil { + return + } + if !isValidZone { + log.LogWarnf("action[RepairZone], vol[%v], zoneName[%v], dpReplicaNum[%v] can not be automatically repaired", vol.Name, vol.zoneName, vol.dpReplicaNum) + return + } + rps := partition.liveReplicas(defaultDataPartitionTimeOutSec) + if len(rps) < int(vol.dpReplicaNum) { + log.LogWarnf("action[RepairZone], vol[%v], zoneName[%v], live Replicas [%v] less than dpReplicaNum[%v], can not be automatically repaired", vol.Name, vol.zoneName, len(rps), vol.dpReplicaNum) + return + } + zoneList = strings.Split(vol.zoneName, ",") + if len(partition.Replicas) != int(vol.dpReplicaNum) { + log.LogWarnf("action[RepairZone], data replica length[%v] not equal to dpReplicaNum[%v]", len(partition.Replicas), vol.dpReplicaNum) + return + } + if partition.isRecover { + log.LogWarnf("action[RepairZone], data partition[%v] is recovering", partition.PartitionID) + return + } + var dpInRecover int + dpInRecover = c.dataPartitionInRecovering() + if int32(dpInRecover) >= c.cfg.DataPartitionsRecoverPoolSize { + log.LogWarnf("action[repairDataPartition] clusterID[%v] Recover pool is full, recover partition[%v], pool size[%v]", c.Name, dpInRecover, c.cfg.DataPartitionsRecoverPoolSize) + return + } + if isNeedBalance, err = partition.needToRebalanceZone(c, zoneList); err != nil { + return + } + if !isNeedBalance { + return + } + if err = c.sendRepairDataPartitionTask(partition, BalanceDataZone); err != nil { + return + } + return +} + +var getTargetAddressForDataPartitionDecommission = func(c *Cluster, offlineAddr string, dp *DataPartition, excludeNodeSets []uint64, zoneName string, destZoneName string) (oldAddr, newAddr string, err error) { + var ( + dataNode *DataNode + zone *Zone + zones []string + ns *nodeSet + excludeZone string + targetHosts []string + ) + if err = c.validateDecommissionDataPartition(dp, offlineAddr); err != nil { + return + } + if dataNode, err = c.dataNode(offlineAddr); err != nil { + return + } + if destZoneName != "" { + if zone, err = c.t.getZone(destZoneName); err != nil { + return + } + if targetHosts, _, err = zone.getAvailDataNodeHosts(excludeNodeSets, dp.Hosts, 1); err != nil { + return + } + } else { + if dataNode.ZoneName == "" { + err = 
fmt.Errorf("dataNode[%v] zone is nil", dataNode.Addr) + return + } + if zone, err = c.t.getZone(dataNode.ZoneName); err != nil { + return + } + if ns, err = zone.getNodeSet(dataNode.NodeSetID); err != nil { + return + } + if targetHosts, _, err = ns.getAvailDataNodeHosts(dp.Hosts, 1); err != nil { + // select data nodes from the other node set in same zone + excludeNodeSets = append(excludeNodeSets, ns.ID) + if targetHosts, _, err = zone.getAvailDataNodeHosts(excludeNodeSets, dp.Hosts, 1); err != nil { + // select data nodes from the other zone + zones = dp.getLiveZones(dataNode.Addr) + if len(zones) == 0 { + excludeZone = zone.name + } else { + excludeZone = zones[0] + } + if targetHosts, _, err = c.chooseTargetDataNodes(excludeZone, excludeNodeSets, dp.Hosts, 1, zoneName); err != nil { + return + } + } + } + } + newAddr = targetHosts[0] + oldAddr = offlineAddr + return +} func (c *Cluster) validateDecommissionDataPartition(dp *DataPartition, offlineAddr string) (err error) { dp.RLock() defer dp.RUnlock() + if ok := dp.hasHost(offlineAddr); !ok { + err = fmt.Errorf("offline address:[%v] is not in data partition hosts:%v", offlineAddr, dp.Hosts) + return + } + var vol *Vol if vol, err = c.getVol(dp.VolName); err != nil { return } - if err = dp.hasMissingOneReplica(int(vol.dpReplicaNum)); err != nil { + if err = dp.hasMissingOneReplica(offlineAddr, int(vol.dpReplicaNum)); err != nil { return } @@ -1141,7 +1555,7 @@ func (c *Cluster) validateDecommissionDataPartition(dp *DataPartition, offlineAd return } - if dp.isRecover { + if dp.isRecover && !dp.isLatestReplica(offlineAddr) { err = fmt.Errorf("vol[%v],data partition[%v] is recovering,[%v] can't be decommissioned", vol.Name, dp.PartitionID, offlineAddr) return } @@ -1196,29 +1610,12 @@ func (c *Cluster) buildAddDataPartitionRaftMemberTaskAndSyncSendTask(dp *DataPar } func (c *Cluster) addDataPartitionRaftMember(dp *DataPartition, addPeer proto.Peer) (err error) { - dp.Lock() - defer dp.Unlock() - if contains(dp.Hosts, addPeer.Addr) { - err = fmt.Errorf("vol[%v],data partition[%v] has contains host[%v]", dp.VolName, dp.PartitionID, addPeer.Addr) - return - } - var ( - candidateAddrs []string leaderAddr string + candidateAddrs []string ) - candidateAddrs = make([]string, 0, len(dp.Hosts)) - leaderAddr = dp.getLeaderAddr() - if leaderAddr != "" && contains(dp.Hosts, leaderAddr) { - candidateAddrs = append(candidateAddrs, leaderAddr) - } else { - leaderAddr = "" - } - for _, host := range dp.Hosts { - if host == leaderAddr { - continue - } - candidateAddrs = append(candidateAddrs, host) + if leaderAddr, candidateAddrs, err = dp.prepareAddRaftMember(addPeer); err != nil { + return } //send task to leader addr first,if need to retry,then send to other addr for index, host := range candidateAddrs { @@ -1236,13 +1633,16 @@ func (c *Cluster) addDataPartitionRaftMember(dp *DataPartition, addPeer proto.Pe if err != nil { return } + dp.Lock() newHosts := make([]string, 0, len(dp.Hosts)+1) newPeers := make([]proto.Peer, 0, len(dp.Peers)+1) newHosts = append(dp.Hosts, addPeer.Addr) newPeers = append(dp.Peers, addPeer) if err = dp.update("addDataPartitionRaftMember", dp.VolName, newPeers, newHosts, c); err != nil { + dp.Unlock() return } + dp.Unlock() return } @@ -1272,7 +1672,7 @@ func (c *Cluster) createDataReplica(dp *DataPartition, addPeer proto.Peer) (err return } -func (c *Cluster) removeDataReplica(dp *DataPartition, addr string, validate bool) (err error) { +func (c *Cluster) removeDataReplica(dp *DataPartition, addr string, validate, 
migrationMode bool) (err error) { defer func() { if err != nil { log.LogErrorf("action[removeDataReplica],vol[%v],data partition[%v],err[%v]", dp.VolName, dp.PartitionID, err) @@ -1283,7 +1683,7 @@ func (c *Cluster) removeDataReplica(dp *DataPartition, addr string, validate boo return } } - ok := c.isRecovering(dp, addr) + ok := c.isRecovering(dp, addr) && !dp.isLatestReplica(addr) if ok { err = fmt.Errorf("vol[%v],data partition[%v] can't decommision until it has recovered", dp.VolName, dp.PartitionID) return @@ -1293,10 +1693,11 @@ func (c *Cluster) removeDataReplica(dp *DataPartition, addr string, validate boo return } removePeer := proto.Peer{ID: dataNode.ID, Addr: addr} - if err = c.removeDataPartitionRaftMember(dp, removePeer); err != nil { + + if err = c.removeDataPartitionRaftMember(dp, removePeer, migrationMode); err != nil { return } - if err = c.deleteDataReplica(dp, dataNode); err != nil { + if err = c.deleteDataReplica(dp, dataNode, migrationMode); err != nil { return } leaderAddr := dp.getLeaderAddrWithLock() @@ -1335,12 +1736,12 @@ func (c *Cluster) isRecovering(dp *DataPartition, addr string) (isRecover bool) return } -func (c *Cluster) removeDataPartitionRaftMember(dp *DataPartition, removePeer proto.Peer) (err error) { - dp.offlineMutex.Lock() - defer dp.offlineMutex.Unlock() + +func (c *Cluster) removeDataPartitionRaftMember(dp *DataPartition, removePeer proto.Peer, migrationMode bool) (err error) { defer func() { if err1 := c.updateDataPartitionOfflinePeerIDWithLock(dp, 0); err1 != nil { - err = errors.Trace(err, "updateDataPartitionOfflinePeerIDWithLock failed, err[%v]", err1) } + err = errors.Trace(err, "updateDataPartitionOfflinePeerIDWithLock failed, err[%v]", err1) + } }() if err = c.updateDataPartitionOfflinePeerIDWithLock(dp, removePeer.ID); err != nil { log.LogErrorf("action[removeDataPartitionRaftMember] vol[%v],data partition[%v],err[%v]", dp.VolName, dp.PartitionID, err) @@ -1350,6 +1751,7 @@ func (c *Cluster) removeDataPartitionRaftMember(dp *DataPartition, removePeer pr if err != nil { return } + task.ReserveResource = migrationMode leaderAddr := dp.getLeaderAddr() leaderDataNode, err := c.dataNode(leaderAddr) if _, err = leaderDataNode.TaskManager.syncSendAdminTask(task); err != nil { @@ -1370,12 +1772,14 @@ func (c *Cluster) removeDataPartitionRaftMember(dp *DataPartition, removePeer pr } newPeers = append(newPeers, peer) } + dp.Lock() if err = dp.update("removeDataPartitionRaftMember", dp.VolName, newPeers, newHosts, c); err != nil { + dp.Unlock() return } + dp.Unlock() return } - func (c *Cluster) updateDataPartitionOfflinePeerIDWithLock(dp *DataPartition, peerID uint64) (err error) { dp.Lock() defer dp.Unlock() @@ -1385,7 +1789,10 @@ func (c *Cluster) updateDataPartitionOfflinePeerIDWithLock(dp *DataPartition, pe } return } -func (c *Cluster) deleteDataReplica(dp *DataPartition, dataNode *DataNode) (err error) { + + + +func (c *Cluster) deleteDataReplica(dp *DataPartition, dataNode *DataNode, migrationMode bool) (err error) { dp.Lock() // in case dataNode is unreachable,update meta first. 
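Worth noting how the new migrationMode flag threads through this path: the raft member is removed and master metadata updated, but the node-side delete task is skipped, so a migrated replica keeps its data. A hedged sketch of the two call shapes, assuming the signatures introduced in this diff:

	// normal decommission: raft member removed, then replica data deleted on the node
	err := c.removeDataReplica(dp, addr, true /* validate */, false /* migrationMode */)

	// migration: metadata updated and task.ReserveResource set, delete task not sent
	err = c.removeDataReplica(dp, addr, true /* validate */, true /* migrationMode */)

The body of deleteDataReplica resumes below: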
dp.removeReplicaByAddr(dataNode.Addr) @@ -1396,6 +1803,9 @@ func (c *Cluster) deleteDataReplica(dp *DataPartition, dataNode *DataNode) (err } task := dp.createTaskToDeleteDataPartition(dataNode.Addr) dp.Unlock() + if migrationMode { + return + } _, err = dataNode.TaskManager.syncSendAdminTask(task) if err != nil { log.LogErrorf("action[deleteDataReplica] vol[%v],data partition[%v],err[%v]", dp.VolName, dp.PartitionID, err) @@ -1453,7 +1863,8 @@ func (c *Cluster) getBadDataPartitionsView() (bpvs []badPartitionView) { return } -func (c *Cluster) decommissionMetaNode(metaNode *MetaNode) (err error) { + +func (c *Cluster) decommissionMetaNode(metaNode *MetaNode, strictMode bool) (err error) { msg := fmt.Sprintf("action[decommissionMetaNode],clusterID[%v] Node[%v] begin", c.Name, metaNode.Addr) log.LogWarn(msg) var wg sync.WaitGroup @@ -1469,7 +1880,7 @@ func (c *Cluster) decommissionMetaNode(metaNode *MetaNode) (err error) { wg.Add(1) go func(mp *MetaPartition) { defer wg.Done() - if err1 := c.decommissionMetaPartition(metaNode.Addr, mp); err1 != nil { + if err1 := c.decommissionMetaPartition(metaNode.Addr, mp, getTargetAddressForMetaPartitionDecommission, strictMode); err1 != nil { errChannel <- err1 } }(mp) @@ -1498,20 +1909,21 @@ func (c *Cluster) deleteMetaNodeFromCache(metaNode *MetaNode) { go metaNode.clean() } -func (c *Cluster) updateVol(name, authKey string, newArgs *VolVarargs) (err error) { + +func (c *Cluster) updateVol(name, authKey, zoneName, description string, capacity uint64, replicaNum uint8, followerRead, authenticate, enableToken, autoRepair bool) (err error) { var ( - vol *Vol - serverAuthKey string - oldDpReplicaNum uint8 - oldCapacity uint64 - oldFollowerRead bool - oldAuthenticate bool - oldEnableToken bool - oldZoneName string - oldDescription string - oldDpSelectorName string - oldDpSelectorParm string - volUsedSpace uint64 + vol *Vol + serverAuthKey string + oldDpReplicaNum uint8 + oldCapacity uint64 + oldFollowerRead bool + oldAuthenticate bool + oldEnableToken bool + oldAutoRepair bool + oldZoneName string + oldDescription string + oldCrossZone bool + zoneList []string ) if vol, err = c.getVol(name); err != nil { log.LogErrorf("action[updateVol] err[%v]", err) @@ -1524,18 +1936,15 @@ func (c *Cluster) updateVol(name, authKey string, newArgs *VolVarargs) (err erro if !matchKey(serverAuthKey, authKey) { return proto.ErrVolAuthKeyNotMatch } - volUsedSpace = vol.totalUsedSpace() - if float64(newArgs.capacity*util.GB) < float64(volUsedSpace)*1.2 { - err = fmt.Errorf("capacity[%v] has to be 20 percent larger than the used space[%v]", newArgs.capacity, - volUsedSpace/util.GB) - goto errHandler - } - if newArgs.dpReplicaNum > vol.dpReplicaNum { - err = fmt.Errorf("don't support new replicaNum[%v] larger than old dpReplicaNum[%v]", newArgs.dpReplicaNum, - vol.dpReplicaNum) + //if capacity < vol.Capacity { + // err = fmt.Errorf("capacity[%v] less than old capacity[%v]", capacity, vol.Capacity) + // goto errHandler + //} + if replicaNum > vol.dpReplicaNum { + err = fmt.Errorf("don't support new replicaNum[%v] larger than old dpReplicaNum[%v]", replicaNum, vol.dpReplicaNum) goto errHandler } - if newArgs.enableToken == true && len(vol.tokens) == 0 { + if enableToken == true && len(vol.tokens) == 0 { if err = c.createToken(vol, proto.ReadOnlyToken); err != nil { goto errHandler } @@ -1543,43 +1952,42 @@ func (c *Cluster) updateVol(name, authKey string, newArgs *VolVarargs) (err erro goto errHandler } } - - if vol.crossZone && newArgs.zoneName != "" { - err = fmt.Errorf("only 
the vol which don't across zones,can specified zoneName") - goto errHandler - } - if newArgs.zoneName != "" { - _, err = c.t.getZone(newArgs.zoneName) - if err != nil { + oldZoneName = vol.zoneName + if zoneName != "" { + if err = c.validZone(zoneName, int(replicaNum)); err != nil { goto errHandler } + if err = c.validZone(zoneName, int(vol.mpReplicaNum)); err != nil { + goto errHandler + } + vol.zoneName = zoneName + } + oldCrossZone = vol.crossZone + zoneList = strings.Split(vol.zoneName, ",") + if len(zoneList) > 1 { + vol.crossZone = true + } else { + vol.crossZone = false } - oldCapacity = vol.Capacity oldDpReplicaNum = vol.dpReplicaNum oldFollowerRead = vol.FollowerRead oldAuthenticate = vol.authenticate oldEnableToken = vol.enableToken - oldZoneName = vol.zoneName + oldAutoRepair = vol.autoRepair oldDescription = vol.description - oldDpSelectorName = vol.dpSelectorName - oldDpSelectorParm = vol.dpSelectorParm - - vol.zoneName = newArgs.zoneName - vol.Capacity = newArgs.capacity - vol.FollowerRead = newArgs.followerRead - vol.authenticate = newArgs.authenticate - vol.enableToken = newArgs.enableToken - if newArgs.description != "" { - vol.description = newArgs.description + vol.Capacity = capacity + vol.FollowerRead = followerRead + vol.authenticate = authenticate + vol.enableToken = enableToken + vol.autoRepair = autoRepair + if description != "" { + vol.description = description } //only reduced replica num is supported - if newArgs.dpReplicaNum != 0 && newArgs.dpReplicaNum < vol.dpReplicaNum { - vol.dpReplicaNum = newArgs.dpReplicaNum + if replicaNum != 0 && replicaNum < vol.dpReplicaNum { + vol.dpReplicaNum = replicaNum } - vol.dpSelectorName = newArgs.dpSelectorName - vol.dpSelectorParm = newArgs.dpSelectorParm - if err = c.syncUpdateVol(vol); err != nil { vol.Capacity = oldCapacity vol.dpReplicaNum = oldDpReplicaNum @@ -1587,10 +1995,9 @@ func (c *Cluster) updateVol(name, authKey string, newArgs *VolVarargs) (err erro vol.authenticate = oldAuthenticate vol.enableToken = oldEnableToken vol.zoneName = oldZoneName + vol.crossZone = oldCrossZone + vol.autoRepair = oldAutoRepair vol.description = oldDescription - vol.dpSelectorName = oldDpSelectorName - vol.dpSelectorParm = oldDpSelectorParm - log.LogErrorf("action[updateVol] vol[%v] err[%v]", name, err) err = proto.ErrPersistenceByRaft goto errHandler @@ -1605,7 +2012,7 @@ errHandler: // Create a new volume. // By default we create 3 meta partitions and 10 data partitions during initialization. 
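A note on the updateVol rewrite above: it follows a snapshot/mutate/persist/rollback discipline, and crossZone is now derived from the zone list rather than passed in. A minimal, self-contained sketch of that pattern (illustrative names only; the real Vol carries many more fields):

package main

import (
	"fmt"
	"strings"
)

// volume models only the fields this sketch needs.
type volume struct {
	Capacity  uint64
	zoneName  string
	crossZone bool
}

// updateWithRollback mirrors updateVol: snapshot the old values, mutate in
// place, persist, and restore every mutated field if persistence fails.
func updateWithRollback(v *volume, zoneName string, capacity uint64, persist func(*volume) error) error {
	oldCap, oldZone, oldCross := v.Capacity, v.zoneName, v.crossZone
	v.Capacity = capacity
	if zoneName != "" {
		v.zoneName = zoneName
	}
	// crossZone is derived: more than one zone in the list implies cross-zone.
	v.crossZone = len(strings.Split(v.zoneName, ",")) > 1
	if err := persist(v); err != nil {
		v.Capacity, v.zoneName, v.crossZone = oldCap, oldZone, oldCross
		return fmt.Errorf("persist failed, volume restored: %v", err)
	}
	return nil
}

func main() {
	v := &volume{Capacity: 100, zoneName: "zone1"}
	err := updateWithRollback(v, "zone1,zone2", 200, func(*volume) error { return nil })
	fmt.Println(v.crossZone, v.Capacity, err) // true 200 <nil>
}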
-func (c *Cluster) createVol(name, owner, zoneName, description string, mpCount, dpReplicaNum, size, capacity int, followerRead, authenticate, crossZone, enableToken bool) (vol *Vol, err error) { +func (c *Cluster) createVol(name, owner, zoneName, description string, mpCount, dpReplicaNum, size, capacity int, followerRead, authenticate, enableToken, autoRepair bool) (vol *Vol, err error) { var ( dataPartitionSize uint64 readWriteDataPartitions int @@ -1615,21 +2022,13 @@ func (c *Cluster) createVol(name, owner, zoneName, description string, mpCount, } else { dataPartitionSize = uint64(size) * util.GB } - - if crossZone && c.t.zoneLen() <= 1 { - return nil, fmt.Errorf("cluster has one zone,can't cross zone") - } - if crossZone && zoneName != "" { - return nil, fmt.Errorf("only the vol which don't across zones,can specified zoneName") + if err = c.validZone(zoneName, dpReplicaNum); err != nil { + goto errHandler } - if zoneName != "" { - if _, err = c.t.getZone(zoneName); err != nil { - return - } - } else if !crossZone { - zoneName = DefaultZoneName + if vol, err = c.doCreateVol(name, owner, zoneName, description, dataPartitionSize, uint64(capacity), dpReplicaNum, followerRead, authenticate, enableToken, autoRepair); err != nil { + goto errHandler } - if vol, err = c.doCreateVol(name, owner, zoneName, description, dataPartitionSize, uint64(capacity), dpReplicaNum, followerRead, authenticate, crossZone, enableToken); err != nil { + if err = c.validZone(zoneName, int(vol.mpReplicaNum)); err != nil { goto errHandler } if err = vol.initMetaPartitions(c, mpCount); err != nil { @@ -1657,7 +2056,7 @@ errHandler: return } -func (c *Cluster) doCreateVol(name, owner, zoneName, description string, dpSize, capacity uint64, dpReplicaNum int, followerRead, authenticate, crossZone, enableToken bool) (vol *Vol, err error) { +func (c *Cluster) doCreateVol(name, owner, zoneName, description string, dpSize, capacity uint64, dpReplicaNum int, followerRead, authenticate, enableToken, autoRepair bool) (vol *Vol, err error) { var id uint64 c.createVolMutex.Lock() defer c.createVolMutex.Unlock() @@ -1670,7 +2069,8 @@ func (c *Cluster) doCreateVol(name, owner, zoneName, description string, dpSize, if err != nil { goto errHandler } - vol = newVol(id, name, owner, zoneName, dpSize, capacity, uint8(dpReplicaNum), defaultReplicaNum, followerRead, authenticate, crossZone, enableToken, createTime, description) + vol = newVol(id, name, owner, zoneName, dpSize, capacity, uint8(dpReplicaNum), defaultReplicaNum, followerRead, authenticate, enableToken, autoRepair, createTime, description) + // refresh oss secure vol.refreshOSSSecure() if err = c.syncAddVol(vol); err != nil { @@ -1727,100 +2127,153 @@ func (c *Cluster) updateInodeIDRange(volName string, start uint64) (err error) { } // Choose the target hosts from the available zones and meta nodes. 
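The chooseTargetMetaHosts rewrite below replaces the crossZone/specifiedZone special cases with round-robin passes over the zones parsed from zoneName. A self-contained sketch of that allocation loop follows (pickOne stands in for zone.getAvailMetaNodeHosts with a count of 1); one detail worth double-checking in the patch itself is that localExcludeHosts is built by appending to the caller's excludeHosts slice, which can alias its backing array, so the sketch copies defensively:

package main

import "fmt"

// pickOneFunc stands in for zone.getAvailMetaNodeHosts(excludeNodeSets, hosts, 1).
type pickOneFunc func(zone string, exclude []string) (string, error)

// pickHostsRoundRobin walks the candidate zones in order, taking one host per
// zone per pass until replicaNum hosts are chosen; hosts already taken in a
// zone are excluded on later passes via a per-zone map.
func pickHostsRoundRobin(zones, exclude []string, replicaNum int, pickOne pickOneFunc) ([]string, error) {
	if len(zones) == 0 {
		return nil, fmt.Errorf("no candidate zones")
	}
	hosts := make([]string, 0, replicaNum)
	taken := make(map[string][]string) // zone -> hosts already selected there
	for len(hosts) < replicaNum {
		for _, z := range zones {
			// Copy so appends never write into the caller's exclude slice.
			local := append(append([]string{}, exclude...), taken[z]...)
			local = append(local, hosts...)
			h, err := pickOne(z, local)
			if err != nil {
				return nil, err
			}
			hosts = append(hosts, h)
			taken[z] = append(taken[z], h)
			if len(hosts) == replicaNum {
				break
			}
		}
	}
	return hosts, nil
}

func main() {
	pick := func(zone string, exclude []string) (string, error) {
		return fmt.Sprintf("%s-host%d", zone, len(exclude)), nil
	}
	hosts, _ := pickHostsRoundRobin([]string{"z1", "z2"}, nil, 3, pick)
	fmt.Println(hosts) // [z1-host0 z2-host1 z1-host3]
}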
-func (c *Cluster) chooseTargetMetaHosts(excludeZone string, excludeNodeSets []uint64, excludeHosts []string, replicaNum int, crossZone bool, specifiedZone string) (hosts []string, peers []proto.Peer, err error) { +func (c *Cluster) chooseTargetMetaHosts(excludeZone string, excludeNodeSets []uint64, excludeHosts []string, replicaNum int, zoneName string) (hosts []string, peers []proto.Peer, err error) { var ( - zones []*Zone - masterZone *Zone + zones []*Zone ) + allocateZoneMap := make(map[*Zone][]string, 0) + hasAllocateNum := 0 excludeZones := make([]string, 0) + hosts = make([]string, 0) + peers = make([]proto.Peer, 0) + if excludeHosts == nil { + excludeHosts = make([]string, 0) + } if excludeZone != "" { excludeZones = append(excludeZones, excludeZone) } - zoneNum := c.decideZoneNum(crossZone) - if replicaNum < zoneNum { - zoneNum = replicaNum - } - // when creating vol,user specified a zone,we reset zoneNum to 1,to be created partition with specified zone, - //if specified zone is not writable,we choose a zone randomly - if specifiedZone != "" { - zoneNum = 1 - zone, err := c.t.getZone(specifiedZone) - if err != nil { - Warn(c.Name, fmt.Sprintf("cluster[%v],specified zone[%v]is not writable", c.Name, specifiedZone)) - } else { - zones = make([]*Zone, 0) - zones = append(zones, zone) - } - } - if zones == nil || specifiedZone == "" { - if zones, err = c.t.allocZonesForMetaNode(zoneNum, replicaNum, excludeZones); err != nil { - return - } - } - - if crossZone && len(zones) < 2 { - log.LogWarn(fmt.Sprintf("action[chooseTargetMetaNodes] ,no enough zones [%v] to be selected, expect select [%v] zones", len(zones), zoneNum)) - return nil, nil, fmt.Errorf("action[chooseTargetMetaNodes] no enough zones [%v] to be selected, expect select [%v] zones", len(zones), zoneNum) + if zones, err = c.t.allocZonesForMetaNode(zoneName, replicaNum, excludeZones); err != nil { + return } - if len(zones) == 1 { + zoneList := strings.Split(zoneName, ",") + if len(zones) == 1 && len(zoneList) == 1 { if hosts, peers, err = zones[0].getAvailMetaNodeHosts(excludeNodeSets, excludeHosts, replicaNum); err != nil { log.LogErrorf("action[chooseTargetMetaNodes],err[%v]", err) return } - return + goto result } - hosts = make([]string, 0) - peers = make([]proto.Peer, 0) - if excludeHosts == nil { - excludeHosts = make([]string, 0) + if len(zones) == 2 { + switch c.lastPermutationsForZone % 2 { + case 0: + zones = append(make([]*Zone, 0), zones[0], zones[1]) + c.lastPermutationsForZone = (c.lastPermutationsForZone + 1) % 6 + default: + zones = append(make([]*Zone, 0), zones[1], zones[0]) + c.lastPermutationsForZone = (c.lastPermutationsForZone + 1) % 6 + } } - //replicaNum is equal with the number of allocated zones - if replicaNum == len(zones) { + for hasAllocateNum < replicaNum { + localExcludeHosts := excludeHosts for _, zone := range zones { - selectedHosts, selectedPeers, e := zone.getAvailMetaNodeHosts(excludeNodeSets, excludeHosts, 1) + localExcludeHosts = append(localExcludeHosts, allocateZoneMap[zone]...) + selectedHosts, selectedPeers, e := zone.getAvailMetaNodeHosts(excludeNodeSets, localExcludeHosts, 1) if e != nil { return nil, nil, errors.NewError(e) } hosts = append(hosts, selectedHosts...) peers = append(peers, selectedPeers...) + allocateZoneMap[zone] = append(allocateZoneMap[zone], selectedHosts...) 
+ hasAllocateNum = hasAllocateNum + 1 + if hasAllocateNum == replicaNum { + break + } } - goto result } + goto result +result: + log.LogInfof("action[chooseTargetMetaHosts] replicaNum[%v],zoneName[%v],selectedZones[%v],hosts[%v]", replicaNum, zoneName, zones, hosts) + if len(hosts) != replicaNum { + return nil, nil, errors.Trace(proto.ErrNoMetaNodeToCreateMetaPartition, "hosts len[%v],replicaNum[%v]", len(hosts), replicaNum) + } + return +} - // replicaNum larger than with the number of allocated zones - for _, zone := range zones { - if zone.name != c.lastMasterZoneForMetaNode { - masterZone = zone - c.lastMasterZoneForMetaNode = zone.name - break +func (c *Cluster) chooseTargetMetaHostForDecommission(excludeZone string, mp *MetaPartition, excludeHosts []string, replicaNum int, zoneName string) (hosts []string, peers []proto.Peer, err error) { + var zones []*Zone + var targetZone *Zone + zones = make([]*Zone, 0) + zoneList := strings.Split(zoneName, ",") + for _, z := range zoneList { + var zone *Zone + if zone, err = c.t.getZone(z); err != nil { + return } + zones = append(zones, zone) + } - if masterZone == nil { - masterZone = zones[0] + //if not cross zone, choose a zone from all zones + if len(zoneList) == 1 { + zones = c.t.getAllZones() } - for _, zone := range zones { - if zone.name == masterZone.name { - rNum := replicaNum - len(zones) + 1 - selectedHosts, selectedPeers, e := zone.getAvailMetaNodeHosts(excludeNodeSets, excludeHosts, rNum) - if e != nil { - return nil, nil, errors.NewError(e) + demandWriteNodes := 1 + candidateZones := make([]*Zone, 0) + for _, z := range zones { + if z.status == unavailableZone { + continue + } + if excludeZone == z.name { + continue + } + if z.canWriteForMetaNode(uint8(demandWriteNodes)) { + candidateZones = append(candidateZones, z) + } + } + //must have a candidate zone + if len(candidateZones) < 1 { + log.LogError(fmt.Sprintf("action[allocZonesForMetaNode],there are no candidateZones, demandWriteNodes[%v], err:%v", + demandWriteNodes, proto.ErrNoZoneToCreateMetaPartition)) + return nil, nil, proto.ErrNoZoneToCreateMetaPartition + } + if len(zoneList) == 1 { + for index, zone := range candidateZones { + if c.lastMasterZoneForMetaNode == "" { + targetZone = zone + c.lastMasterZoneForMetaNode = targetZone.name + break } - hosts = append(hosts, selectedHosts...) - peers = append(peers, selectedPeers...) - } else { - selectedHosts, selectedPeers, e := zone.getAvailMetaNodeHosts(excludeNodeSets, excludeHosts, 1) - if e != nil { - return nil, nil, errors.NewError(e) + if zone.name == c.lastMasterZoneForMetaNode { + if index == len(candidateZones)-1 { + targetZone = candidateZones[0] + } else { + targetZone = candidateZones[index+1] + } + c.lastMasterZoneForMetaNode = targetZone.name + break } - hosts = append(hosts, selectedHosts...) - peers = append(peers, selectedPeers...) 
+ } + if targetZone == nil { + targetZone = candidateZones[0] + c.lastMasterZoneForMetaNode = targetZone.name } } -result: - log.LogInfof("action[chooseTargetMetaHosts] replicaNum[%v],zoneNum[%v],selectedZones[%v],hosts[%v]", replicaNum, zoneNum, len(zones), hosts) - if len(hosts) != replicaNum { - return nil, nil, errors.Trace(proto.ErrNoMetaNodeToCreateMetaPartition, "hosts len[%v],replicaNum[%v]", len(hosts), replicaNum) + if len(zoneList) > 1 { + var curZonesMap map[string]uint8 + if curZonesMap, err = mp.getMetaZoneMap(c); err != nil { + return + } + //avoid change from 2 zones to 1 zone after decommission + if len(curZonesMap) == 2 && curZonesMap[excludeZone] == 1 { + for k := range curZonesMap { + if k == excludeZone { + continue + } + for _, z := range candidateZones { + if z.name == k { + continue + } + targetZone = z + } + } + } else { + targetZone = candidateZones[0] + } + } + if targetZone == nil { + err = fmt.Errorf("no candidate zones available") + return } + hosts, peers, err = targetZone.getAvailMetaNodeHosts(nil, excludeHosts, 1) return } @@ -2009,3 +2462,127 @@ func (c *Cluster) clearMetaNodes() { return true }) } + +func (c *Cluster) setDataNodeToOfflineState(startID, endID uint64, state bool, zoneName string) { + c.dataNodes.Range(func(key, value interface{}) bool { + node, ok := value.(*DataNode) + if !ok { + return true + } + if node.ID < startID || node.ID > endID { + return true + } + if node.ZoneName != zoneName { + return true + } + node.Lock() + node.ToBeMigrated = state + node.Unlock() + return true + }) +} + +func (c *Cluster) setMetaNodeToOfflineState(startID, endID uint64, state bool, zoneName string) { + c.metaNodes.Range(func(key, value interface{}) bool { + node, ok := value.(*MetaNode) + if !ok { + return true + } + if node.ID < startID || node.ID > endID { + return true + } + if node.ZoneName != zoneName { + return true + } + node.Lock() + node.ToBeMigrated = state + node.Unlock() + return true + }) +} +func (c *Cluster) setDpRecoverPoolSize(dpRecoverPool int32) (err error) { + oldDpPool := atomic.LoadInt32(&c.cfg.DataPartitionsRecoverPoolSize) + atomic.StoreInt32(&c.cfg.DataPartitionsRecoverPoolSize, dpRecoverPool) + + if err = c.syncPutCluster(); err != nil { + log.LogErrorf("action[setDpRecoverPoolSize] err[%v]", err) + atomic.StoreInt32(&c.cfg.DataPartitionsRecoverPoolSize, oldDpPool) + err = proto.ErrPersistenceByRaft + return + } + c.initDpRepairChan() + return +} + +func (c *Cluster) setMpRecoverPoolSize(mpRecoverPool int32) (err error) { + oldMpPool := atomic.LoadInt32(&c.cfg.MetaPartitionsRecoverPoolSize) + atomic.StoreInt32(&c.cfg.MetaPartitionsRecoverPoolSize, mpRecoverPool) + + if err = c.syncPutCluster(); err != nil { + log.LogErrorf("action[setMpRecoverPoolSize] err[%v]", err) + atomic.StoreInt32(&c.cfg.MetaPartitionsRecoverPoolSize, oldMpPool) + err = proto.ErrPersistenceByRaft + return + } + c.initMpRepairChan() + return +} + +func (c *Cluster) initDpRepairChan() { + var chanCapacity int32 + chanCapacity = c.cfg.DataPartitionsRecoverPoolSize + if chanCapacity > maxDataPartitionsRecoverPoolSize { + chanCapacity = maxDataPartitionsRecoverPoolSize + } + if chanCapacity < 1 { + chanCapacity = 1 + } + c.dpRepairChan = make(chan *RepairTask, chanCapacity) +} + +func (c *Cluster) initMpRepairChan() { + var chanCapacity int32 + chanCapacity = c.cfg.MetaPartitionsRecoverPoolSize + if chanCapacity > maxMetaPartitionsRecoverPoolSize { + chanCapacity = maxMetaPartitionsRecoverPoolSize + } + if chanCapacity < 1 { + chanCapacity = 1 + } + 
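// Reviewer note: the clamp above bounds the repair channel capacity to
// [1, maxMetaPartitionsRecoverPoolSize]. defaultRecoverPoolSize is -1, so an
// unconfigured cluster still gets a usable capacity of 1 instead of a runtime
// panic from make(chan, <negative>). The send side (sendRepair*PartitionTask
// below) pairs this with a select/default, so a full channel only drops and
// logs the task rather than blocking the checker.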
c.mpRepairChan = make(chan *RepairTask, chanCapacity) +} + +func (c *Cluster) sendRepairMetaPartitionTask(mp *MetaPartition, rType RepairType) (err error) { + var repairTask *RepairTask + repairTask = &RepairTask{ + RType: rType, + Pid: mp.PartitionID, + } + select { + case c.mpRepairChan <- repairTask: + Warn(c.Name, fmt.Sprintf("action[sendRepairMetaPartitionTask] clusterID[%v] vol[%v] meta partition[%v] "+ + "task type[%v]", c.Name, mp.volName, mp.PartitionID, rType)) + default: + Warn(c.Name, fmt.Sprintf("action[sendRepairMetaPartitionTask] clusterID[%v] vol[%v] meta partition[%v] "+ + "task type[%v], mpRepairChan has been full", c.Name, mp.volName, mp.PartitionID, rType)) + } + return +} + +func (c *Cluster) sendRepairDataPartitionTask(dp *DataPartition, rType RepairType) (err error) { + var repairTask *RepairTask + repairTask = &RepairTask{ + RType: rType, + Pid: dp.PartitionID, + } + select { + case c.dpRepairChan <- repairTask: + Warn(c.Name, fmt.Sprintf("action[sendRepairDataPartitionTask] clusterID[%v] vol[%v] data partition[%v] "+ + "task type[%v]", c.Name, dp.VolName, dp.PartitionID, rType)) + default: + Warn(c.Name, fmt.Sprintf("action[sendRepairDataPartitionTask] clusterID[%v] vol[%v] data partition[%v] "+ + "task type[%v], chanLength[%v], chanCapacity[%v], dpRepairChan has been full", c.Name, dp.VolName, dp.PartitionID, rType, len(c.dpRepairChan), + cap(c.dpRepairChan))) + } + return +} diff --git a/master/cluster_task.go b/master/cluster_task.go index 3e4955b59f..71b7c7031d 100644 --- a/master/cluster_task.go +++ b/master/cluster_task.go @@ -26,6 +26,8 @@ import ( "github.com/chubaofs/chubaofs/util/log" ) +type ChooseMetaHostFunc func(c *Cluster, nodeAddr string, mp *MetaPartition, oldHosts []string, excludeNodeSets []uint64, zoneName string) (oldAddr, addAddr string, err error) + func (c *Cluster) addDataNodeTasks(tasks []*proto.AdminTask) { for _, t := range tasks { c.addDataNodeTask(t) @@ -84,7 +86,9 @@ func (c *Cluster) loadDataPartition(dp *DataPartition) { }() } -// taking the given mata partition offline. +// taking the given mata partition offline. In strict mode, only if the size of the replica is equal, +// or the number of files is equal, the recovery is considered complete. when it is triggered by migrated metaNode, +// the strict mode is true,otherwise is false. // 1. checking if the meta partition can be offline. // There are two cases where the partition is not allowed to be offline: // (1) the replica is not in the latest host list @@ -93,36 +97,73 @@ func (c *Cluster) loadDataPartition(dp *DataPartition) { // 3. synchronized decommission meta partition // 4. synchronized create a new meta partition // 5. 
persistent the new host list -func (c *Cluster) decommissionMetaPartition(nodeAddr string, mp *MetaPartition) (err error) { +func (c *Cluster) decommissionMetaPartition(nodeAddr string, mp *MetaPartition, chooseMetaHostFunc ChooseMetaHostFunc, strictMode bool) (err error) { var ( - newPeers []proto.Peer - metaNode *MetaNode - zone *Zone - ns *nodeSet + addAddr string excludeNodeSets []uint64 oldHosts []string - zones []string - excludeZone string + vol *Vol ) + mp.offlineMutex.Lock() + defer mp.offlineMutex.Unlock() + oldHosts = mp.Hosts + if vol, err = c.getVol(mp.volName); err != nil { + goto errHandler + } + if nodeAddr, addAddr, err = chooseMetaHostFunc(c, nodeAddr, mp, oldHosts, excludeNodeSets, vol.zoneName); err != nil { + goto errHandler + } + log.LogWarnf("action[decommissionMetaPartition],volName[%v],nodeAddr[%v],partitionID[%v] begin", mp.volName, nodeAddr, mp.PartitionID) - mp.RLock() - if !contains(mp.Hosts, nodeAddr) { - mp.RUnlock() - return + if err = c.deleteMetaReplica(mp, nodeAddr, false, strictMode); err != nil { + goto errHandler } - oldHosts = mp.Hosts + if err = c.addMetaReplica(mp, addAddr); err != nil { + goto errHandler + } + mp.IsRecover = true + if strictMode { + c.putMigratedMetaPartitions(nodeAddr, mp.PartitionID) + } else { + c.putBadMetaPartitions(nodeAddr, mp.PartitionID) + } + mp.RLock() + c.syncUpdateMetaPartition(mp) mp.RUnlock() + return +errHandler: + log.LogError(fmt.Sprintf("action[decommissionMetaPartition],volName: %v,partitionID: %v,err: %v", + mp.volName, mp.PartitionID, errors.Stack(err))) + Warn(c.Name, fmt.Sprintf("clusterID[%v] meta partition[%v] offline addr[%v] failed,err:%v", + c.Name, mp.PartitionID, nodeAddr, err)) + if err != nil { + err = fmt.Errorf("vol[%v],partition[%v],err[%v]", mp.volName, mp.PartitionID, err) + } + return +} + +var getTargetAddressForMetaPartitionDecommission = func(c *Cluster, nodeAddr string, mp *MetaPartition, oldHosts []string, excludeNodeSets []uint64, zoneName string) (oldAddr, addAddr string, err error) { + var ( + metaNode *MetaNode + zone *Zone + zones []string + ns *nodeSet + newPeers []proto.Peer + excludeZone string + ) + oldAddr = nodeAddr + if err = c.validateDecommissionMetaPartition(mp, nodeAddr); err != nil { - goto errHandler + return } if metaNode, err = c.metaNode(nodeAddr); err != nil { - goto errHandler + return } if zone, err = c.t.getZone(metaNode.ZoneName); err != nil { - goto errHandler + return } if ns, err = zone.getNodeSet(metaNode.NodeSetID); err != nil { - goto errHandler + return } if _, newPeers, err = ns.getAvailMetaNodeHosts(oldHosts, 1); err != nil { // choose a meta node in other node set in the same zone @@ -135,33 +176,13 @@ func (c *Cluster) decommissionMetaPartition(nodeAddr string, mp *MetaPartition) excludeZone = zones[0] } // choose a meta node in other zone - if _, newPeers, err = c.chooseTargetMetaHosts(excludeZone, excludeNodeSets, oldHosts, 1, false, ""); err != nil { - goto errHandler + if _, newPeers, err = c.chooseTargetMetaHostForDecommission(excludeZone, mp, oldHosts, 1, zoneName); err != nil { + return } } } - if err = c.deleteMetaReplica(mp, nodeAddr, false); err != nil { - goto errHandler - } - if err = c.addMetaReplica(mp, newPeers[0].Addr); err != nil { - goto errHandler - } - mp.IsRecover = true - c.putBadMetaPartitions(nodeAddr, mp.PartitionID) - mp.RLock() - c.syncUpdateMetaPartition(mp) - mp.RUnlock() - Warn(c.Name, fmt.Sprintf("action[decommissionMetaPartition] clusterID[%v] vol[%v] meta partition[%v] "+ - "offline addr[%v] success,new addr[%v]", 
c.Name, mp.volName, mp.PartitionID, nodeAddr, newPeers[0].Addr)) - return - -errHandler: - log.LogError(fmt.Sprintf("action[decommissionMetaPartition],volName: %v,partitionID: %v,err: %v", - mp.volName, mp.PartitionID, errors.Stack(err))) - Warn(c.Name, fmt.Sprintf("clusterID[%v] meta partition[%v] offline addr[%v] failed,err:%v", - c.Name, mp.PartitionID, nodeAddr, err)) - if err != nil { - err = fmt.Errorf("vol[%v],partition[%v],err[%v]", mp.volName, mp.PartitionID, err) + if len(newPeers) > 0 { + addAddr = newPeers[0].Addr } return } @@ -170,6 +191,11 @@ func (c *Cluster) validateDecommissionMetaPartition(mp *MetaPartition, nodeAddr mp.RLock() defer mp.RUnlock() var vol *Vol + if !contains(mp.Hosts, nodeAddr) { + err = fmt.Errorf("offline address:[%v] is not in meta partition hosts:%v", nodeAddr, mp.Hosts) + return + } + if vol, err = c.getVol(mp.volName); err != nil { return } @@ -177,11 +203,11 @@ func (c *Cluster) validateDecommissionMetaPartition(mp *MetaPartition, nodeAddr return } - if err = mp.hasMissingOneReplica(int(vol.mpReplicaNum)); err != nil { + if err = mp.hasMissingOneReplica(nodeAddr, int(vol.mpReplicaNum)); err != nil { return } - if mp.IsRecover { + if mp.IsRecover && !mp.isLatestReplica(nodeAddr) { err = fmt.Errorf("vol[%v],meta partition[%v] is recovering,[%v] can't be decommissioned", vol.Name, mp.PartitionID, nodeAddr) return } @@ -270,7 +296,7 @@ func (c *Cluster) checkLackReplicaMetaPartitions() (lackReplicaMetaPartitions [] return } -func (c *Cluster) deleteMetaReplica(partition *MetaPartition, addr string, validate bool) (err error) { +func (c *Cluster) deleteMetaReplica(partition *MetaPartition, addr string, validate, migrationMode bool) (err error) { defer func() { if err != nil { log.LogErrorf("action[deleteMetaReplica],vol[%v],data partition[%v],err[%v]", partition.volName, partition.PartitionID, err) @@ -286,16 +312,16 @@ func (c *Cluster) deleteMetaReplica(partition *MetaPartition, addr string, valid return } removePeer := proto.Peer{ID: metaNode.ID, Addr: addr} - if err = c.removeMetaPartitionRaftMember(partition, removePeer); err != nil { + if err = c.removeMetaPartitionRaftMember(partition, removePeer, migrationMode); err != nil { return } - if err = c.deleteMetaPartition(partition, metaNode); err != nil { + if err = c.deleteMetaPartition(partition, metaNode, migrationMode); err != nil { return } return } -func (c *Cluster) deleteMetaPartition(partition *MetaPartition, removeMetaNode *MetaNode) (err error) { +func (c *Cluster) deleteMetaPartition(partition *MetaPartition, removeMetaNode *MetaNode, migrationMode bool) (err error) { partition.Lock() mr, err := partition.getMetaReplica(removeMetaNode.Addr) if err != nil { @@ -306,6 +332,9 @@ func (c *Cluster) deleteMetaPartition(partition *MetaPartition, removeMetaNode * partition.removeReplicaByAddr(removeMetaNode.Addr) partition.removeMissingReplica(removeMetaNode.Addr) partition.Unlock() + if migrationMode { + return + } _, err = removeMetaNode.Sender.syncSendAdminTask(task) if err != nil { log.LogErrorf("action[deleteMetaPartition] vol[%v],data partition[%v],err[%v]", partition.volName, partition.PartitionID, err) @@ -313,10 +342,9 @@ func (c *Cluster) deleteMetaPartition(partition *MetaPartition, removeMetaNode * return nil } -func (c *Cluster) removeMetaPartitionRaftMember(partition *MetaPartition, removePeer proto.Peer) (err error) { - partition.offlineMutex.Lock() - defer partition.offlineMutex.Unlock() - defer func(){ + +func (c *Cluster) removeMetaPartitionRaftMember(partition *MetaPartition, 
removePeer proto.Peer, migrationMode bool) (err error) { + defer func() { if err1 := c.updateMetaPartitionOfflinePeerIDWithLock(partition, 0); err1 != nil { err = errors.Trace(err, "updateMetaPartitionOfflinePeerIDWithLock failed, err[%v]", err1) } @@ -332,6 +360,7 @@ func (c *Cluster) removeMetaPartitionRaftMember(partition *MetaPartition, remove if err != nil { return } + t.ReserveResource = migrationMode var leaderMetaNode *MetaNode leaderMetaNode = mr.metaNode if leaderMetaNode == nil { @@ -357,9 +386,12 @@ func (c *Cluster) removeMetaPartitionRaftMember(partition *MetaPartition, remove } newPeers = append(newPeers, peer) } + partition.Lock() if err = partition.persistToRocksDB("removeMetaPartitionRaftMember", partition.volName, newHosts, newPeers, c); err != nil { + partition.Unlock() return } + partition.Unlock() if mr.Addr != removePeer.Addr { return } @@ -373,7 +405,7 @@ func (c *Cluster) removeMetaPartitionRaftMember(partition *MetaPartition, remove return } -func (c *Cluster) updateMetaPartitionOfflinePeerIDWithLock(mp *MetaPartition, peerID uint64) (err error){ +func (c *Cluster) updateMetaPartitionOfflinePeerIDWithLock(mp *MetaPartition, peerID uint64) (err error) { mp.Lock() defer mp.Unlock() mp.OfflinePeerID = peerID @@ -569,7 +601,7 @@ func (c *Cluster) doLoadDataPartition(dp *DataPartition) { dp.getFileCount() dp.validateCRC(c.Name) - dp.checkReplicaSize(c.Name,c.cfg.diffSpaceUsage) + dp.checkReplicaSize(c.Name, c.cfg.diffSpaceUsage) dp.setToNormal() } diff --git a/master/cluster_test.go b/master/cluster_test.go index 763e640f77..7c28f1cf73 100644 --- a/master/cluster_test.go +++ b/master/cluster_test.go @@ -20,8 +20,9 @@ func buildPanicVol() *Vol { return nil } var createTime = time.Now().Unix() // record create time of this volume - vol := newVol(id, commonVol.Name, commonVol.Owner, "", commonVol.dataPartitionSize, commonVol.Capacity, - defaultReplicaNum, defaultReplicaNum, false, false, false, false, createTime, "") + vol := newVol(id, commonVol.Name, commonVol.Owner, testZone1+","+testZone2, commonVol.dataPartitionSize, commonVol.Capacity, + defaultReplicaNum, defaultReplicaNum, false, false, true, false, createTime, "") + vol.dataPartitions = nil return vol } @@ -116,6 +117,39 @@ func TestPanicCheckBadDiskRecovery(t *testing.T) { c.scheduleToCheckDiskRecoveryProgress() } +func TestPanicCheckMigratedDataPartitionsRecovery(t *testing.T) { + c := buildPanicCluster() + vol, err := c.getVol(commonVolName) + if err != nil { + t.Error(err) + } + partitionID, err := server.cluster.idAlloc.allocateDataPartitionID() + if err != nil { + t.Error(err) + } + dp := newDataPartition(partitionID, vol.dpReplicaNum, vol.Name, vol.ID) + c.MigratedDataPartitionIds.Store(fmt.Sprintf("%v", dp.PartitionID), dp) + c.checkMigratedDataPartitionsRecoveryProgress() +} + +func TestPanicCheckMigratedMetaPartitionsRecovery(t *testing.T) { + c := buildPanicCluster() + vol, err := c.getVol(commonVolName) + if err != nil { + t.Error(err) + } + partitionID, err := server.cluster.idAlloc.allocateMetaPartitionID() + if err != nil { + t.Error(err) + } + mp := newMetaPartition(partitionID, 1, defaultMaxMetaPartitionInodeID, vol.mpReplicaNum, vol.Name, vol.ID) + vol.addMetaPartition(mp) + c.MigratedMetaPartitionIds.Store(fmt.Sprintf("%v", mp.PartitionID), mp) + mp = nil + c.checkMigratedMetaPartitionRecoveryProgress() + t.Logf("catched panic") +} + func TestCheckBadDiskRecovery(t *testing.T) { server.cluster.checkDataNodeHeartbeat() time.Sleep(5 * time.Second) diff --git a/master/config.go b/master/config.go 
index ec8a081e82..cbc574abf8 100644 --- a/master/config.go +++ b/master/config.go @@ -64,7 +64,8 @@ const ( defaultMetaPartitionMemUsageThreshold float32 = 0.75 // memory usage threshold on a meta partition defaultMaxMetaPartitionCountOnEachNode = 10000 defaultReplicaNum = 3 - defaultDiffSpaceUsage = 1024 * 1024 * 1024 + defaultDiffSpaceUsage = 10 * 1024 * 1024 * 1024 + defaultCrossZoneNum = 3 ) // AddrDatabase is a map that stores the address of a given host (e.g., the leader) @@ -92,6 +93,8 @@ type clusterConfig struct { heartbeatPort int64 replicaPort int64 diffSpaceUsage uint64 + DataPartitionsRecoverPoolSize int32 + MetaPartitionsRecoverPoolSize int32 } func newClusterConfig() (cfg *clusterConfig) { @@ -108,6 +111,8 @@ func newClusterConfig() (cfg *clusterConfig) { cfg.MetaNodeThreshold = defaultMetaPartitionMemUsageThreshold cfg.metaNodeReservedMem = defaultMetaNodeReservedMem cfg.diffSpaceUsage = defaultDiffSpaceUsage + cfg.DataPartitionsRecoverPoolSize = defaultRecoverPoolSize + cfg.MetaPartitionsRecoverPoolSize = defaultRecoverPoolSize return } diff --git a/master/const.go b/master/const.go index d87b6687a0..7d21df6d8a 100644 --- a/master/const.go +++ b/master/const.go @@ -29,6 +29,10 @@ const ( idKey = "id" countKey = "count" startKey = "start" + endKey = "end" + nodeTypeKey = "nodeType" + strictFlagKey = "strict" + stateKey = "state" enableKey = "enable" thresholdKey = "threshold" dataPartitionSizeKey = "size" @@ -42,7 +46,7 @@ const ( akKey = "ak" keywordsKey = "keywords" zoneNameKey = "zoneName" - crossZoneKey = "crossZone" + autoRepairKey = "autoRepair" tokenKey = "token" tokenTypeKey = "tokenType" enableTokenKey = "enableToken" @@ -55,6 +59,14 @@ const ( descriptionKey = "description" dpSelectorNameKey = "dpSelectorName" dpSelectorParmKey = "dpSelectorParm" + dpRecoverPoolSizeKey = "dpRecoverPool" + mpRecoverPoolSizeKey = "mpRecoverPool" +) + +const ( + nodeTypeDataNode = "dataNode" + nodeTypeMetaNode = "metaNode" + nodeTypeAll = "all" ) const ( @@ -64,6 +76,7 @@ const ( dataNodeOfflineErr = "dataNodeOfflineErr " diskOfflineErr = "diskOfflineErr " handleDataPartitionOfflineErr = "handleDataPartitionOffLineErr " + balanceDataPartitionZoneErr = "balanceDataPartitionZoneErr " ) const ( @@ -96,6 +109,10 @@ const ( retrySendSyncTaskInternal = 3 * time.Second defaultRangeOfCountDifferencesAllowed = 50 defaultMinusOfMaxInodeID = 1000 + defaultPercentMinusOfInodeCount = 0.20 + defaultRecoverPoolSize = -1 + maxDataPartitionsRecoverPoolSize = 50 + maxMetaPartitionsRecoverPoolSize = 30 ) const ( diff --git a/master/data_node.go b/master/data_node.go index 463e82ab1d..57a627f8db 100644 --- a/master/data_node.go +++ b/master/data_node.go @@ -25,18 +25,18 @@ import ( // DataNode stores all the information about a data node type DataNode struct { - Total uint64 `json:"TotalWeight"` - Used uint64 `json:"UsedWeight"` + Total uint64 `json:"TotalWeight"` + Used uint64 `json:"UsedWeight"` AvailableSpace uint64 ID uint64 - ZoneName string `json:"Zone"` + ZoneName string `json:"Zone"` Addr string ReportTime time.Time isActive bool - sync.RWMutex `graphql:"-"` - UsageRatio float64 // used / total space - SelectedTimes uint64 // number times that this datanode has been selected as the location for a data partition. - Carry float64 // carry is a factor used in cacluate the node's weight + sync.RWMutex `graphql:"-"` + UsageRatio float64 // used / total space + SelectedTimes uint64 // number times that this datanode has been selected as the location for a data partition. 
+ Carry float64 // carry is a factor used in cacluate the node's weight TaskManager *AdminTaskManager `graphql:"-"` DataPartitionReports []*proto.PartitionReport DataPartitionCount uint32 @@ -44,6 +44,7 @@ type DataNode struct { PersistenceDataPartitions []uint64 BadDisks []string ToBeOffline bool + ToBeMigrated bool } func newDataNode(addr, zoneName, clusterID string) (dataNode *DataNode) { @@ -102,7 +103,8 @@ func (dataNode *DataNode) isWriteAble() (ok bool) { dataNode.RLock() defer dataNode.RUnlock() - if dataNode.isActive == true && dataNode.AvailableSpace > 10*util.GB { + if dataNode.isActive == true && dataNode.AvailableSpace > 10*util.GB && + dataNode.ToBeOffline == false && dataNode.ToBeMigrated == false { ok = true } diff --git a/master/data_partition.go b/master/data_partition.go index 3a83143cae..99a1bed123 100644 --- a/master/data_partition.go +++ b/master/data_partition.go @@ -98,6 +98,29 @@ func (partition *DataPartition) tryToChangeLeader(c *Cluster, dataNode *DataNode return } +func (partition *DataPartition) prepareAddRaftMember(addPeer proto.Peer) (leaderAddr string, candidateAddrs []string, err error) { + partition.RLock() + defer partition.RUnlock() + if contains(partition.Hosts, addPeer.Addr) { + err = fmt.Errorf("vol[%v],data partition[%v] has contains host[%v]", partition.VolName, partition.PartitionID, addPeer.Addr) + return + } + candidateAddrs = make([]string, 0, len(partition.Hosts)) + leaderAddr = partition.getLeaderAddr() + if leaderAddr != "" && contains(partition.Hosts, leaderAddr) { + candidateAddrs = append(candidateAddrs, leaderAddr) + } else { + leaderAddr = "" + } + for _, host := range partition.Hosts { + if host == leaderAddr { + continue + } + candidateAddrs = append(candidateAddrs, host) + } + return +} + func (partition *DataPartition) createTaskToTryToChangeLeader(addr string) (task *proto.AdminTask, err error) { task = proto.NewAdminTask(proto.OpDataPartitionTryToLeader, addr, nil) partition.resetTaskID(task) @@ -141,9 +164,20 @@ func (partition *DataPartition) resetTaskID(t *proto.AdminTask) { } // Check if there is a replica missing or not. 
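The reworked hasMissingOneReplica below counts hosts and replicas while excluding the address being taken offline, so a healthy partition is not rejected merely because one of its members is the decommission target. An equivalent, self-contained form of that counting logic (hypothetical helper names, for illustration):

package main

import "fmt"

// countExcluding returns how many addresses remain once offlineAddr is ignored.
func countExcluding(addrs []string, offlineAddr string) int {
	n := 0
	for _, a := range addrs {
		if a != offlineAddr {
			n++
		}
	}
	return n
}

// hasMissingOneReplica mirrors the new check: a partition is considered to be
// missing a replica only if, excluding the host being decommissioned, either
// the host list or the replica list falls below replicaNum-1.
func hasMissingOneReplica(hosts, replicaAddrs []string, offlineAddr string, replicaNum int) bool {
	return countExcluding(hosts, offlineAddr) < replicaNum-1 ||
		countExcluding(replicaAddrs, offlineAddr) < replicaNum-1
}

func main() {
	hosts := []string{"n1", "n2", "n3"}
	// Decommissioning n3 from a healthy 3-replica partition: not missing.
	fmt.Println(hasMissingOneReplica(hosts, hosts, "n3", 3)) // false
	// Same partition already down to two live replicas: missing.
	fmt.Println(hasMissingOneReplica(hosts, []string{"n1", "n2"}, "n2", 3)) // true
}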
-func (partition *DataPartition) hasMissingOneReplica(replicaNum int) (err error) { - hostNum := len(partition.Replicas) - if hostNum <= replicaNum-1 { +func (partition *DataPartition) hasMissingOneReplica(offlineAddr string, replicaNum int) (err error) { + curHostCount := len(partition.Hosts) + for _, host := range partition.Hosts { + if host == offlineAddr { + curHostCount = curHostCount - 1 + } + } + curReplicaCount := len(partition.Replicas) + for _, r := range partition.Replicas { + if r.Addr == offlineAddr { + curReplicaCount = curReplicaCount - 1 + } + } + if curReplicaCount < replicaNum-1 || curHostCount < replicaNum-1 { log.LogError(fmt.Sprintf("action[%v],partitionID:%v,err:%v", "hasMissingOneReplica", partition.PartitionID, proto.ErrHasOneMissingReplica)) err = proto.ErrHasOneMissingReplica @@ -596,8 +630,9 @@ func (partition *DataPartition) containsBadDisk(diskPath string, nodeAddr string } func (partition *DataPartition) getMinus() (minus float64) { - partition.RLock() - defer partition.RUnlock() + if len(partition.Replicas) == 0 { + return + } used := partition.Replicas[0].Used for _, replica := range partition.Replicas { if math.Abs(float64(replica.Used)-float64(used)) > minus { @@ -607,6 +642,23 @@ func (partition *DataPartition) getMinus() (minus float64) { return minus } +func (partition *DataPartition) getMinusOfFileCount() (minus float64) { + partition.RLock() + defer partition.RUnlock() + var sentry float64 + for index, replica := range partition.Replicas { + if index == 0 { + sentry = float64(replica.FileCount) + continue + } + diff := math.Abs(float64(replica.FileCount) - sentry) + if diff > minus { + minus = diff + } + } + return +} + func (partition *DataPartition) getToBeDecommissionHost(replicaNum int) (host string) { partition.RLock() defer partition.RUnlock() @@ -619,7 +671,9 @@ func (partition *DataPartition) getToBeDecommissionHost(replicaNum int) (host st } func (partition *DataPartition) removeOneReplicaByHost(c *Cluster, host string) (err error) { - if err = c.removeDataReplica(partition, host, false); err != nil { + partition.offlineMutex.Lock() + defer partition.offlineMutex.Unlock() + if err = c.removeDataReplica(partition, host, false, false); err != nil { return } partition.RLock() @@ -683,3 +737,246 @@ func (partition *DataPartition) ToProto(c *Cluster) *proto.DataPartitionInfo { FilesWithMissingReplica: partition.FilesWithMissingReplica, } } + +func (partition *DataPartition) isLatestReplica(addr string) (ok bool) { + hostsLen := len(partition.Hosts) + if hostsLen <= 1 { + return + } + latestAddr := partition.Hosts[hostsLen-1] + return latestAddr == addr +} + +func (partition *DataPartition) isDataCatchUp() (ok bool) { + partition.RLock() + defer partition.RUnlock() + minus := partition.getMinus() + return minus < util.GB +} + +func (partition *DataPartition) isDataCatchUpInStrictMode() (ok bool) { + partition.RLock() + defer partition.RUnlock() + minus := partition.getMinus() + if partition.used > 10*util.GB { + if minus < util.GB { + return true + } + } else if partition.used > util.GB { + if minus < 500*util.MB { + return true + } + } else { + if partition.used == 0 { + return true + } + percent := minus / float64(partition.used) + if partition.used > util.MB { + if percent < 0.5 { + return true + } + } else { + if percent < 0.7 { + return true + } + } + } + return false +} + +//check if the data partition needs to rebalance zone +func (partition *DataPartition) needToRebalanceZone(c *Cluster, zoneList []string) (isNeed bool, err error) { + var 
curZoneMap map[string]uint8 + var curZoneList []string + curZoneList = make([]string, 0) + curZoneMap = make(map[string]uint8, 0) + if curZoneMap, err = partition.getDataZoneMap(c); err != nil { + return + } + for k := range curZoneMap { + curZoneList = append(curZoneList, k) + } + log.LogDebugf("action[needToRebalanceZone],data partitionID:%v,zone name:%v,current zones[%v]", + partition.PartitionID, zoneList, curZoneList) + if (len(zoneList) == 1 && len(curZoneMap) == 1) || (len(curZoneMap) == 2 && (len(zoneList) == 2 || len(zoneList) == 3)) { + isNeed = false + for zone := range curZoneMap { + if !contains(zoneList, zone) { + isNeed = true + return + } + } + return + } + isNeed = true + return +} + +var getTargetAddressForBalanceDataPartitionZone = func(c *Cluster, offlineAddr string, dp *DataPartition, excludeNodeSets []uint64, zoneName string, destZone string) (oldAddr, newAddr string, err error) { + var ( + offlineZoneName string + targetZoneName string + targetZone *Zone + nodesetInTargetZone *nodeSet + addrInTargetZone string + targetHosts []string + ) + if offlineZoneName, targetZoneName, err = dp.getOfflineAndTargetZone(c, zoneName); err != nil { + return + } + if offlineZoneName == "" || targetZoneName == "" { + err = fmt.Errorf("getOfflineAndTargetZone error, offlineZone[%v], targetZone[%v]", offlineZoneName, targetZoneName) + return + } + if targetZone, err = c.t.getZone(targetZoneName); err != nil { + return + } + if oldAddr, err = dp.getAddressByZoneName(c, offlineZoneName); err != nil { + return + } + if oldAddr == "" { + err = fmt.Errorf("can not find address to decommission") + return + } + if err = c.validateDecommissionDataPartition(dp, oldAddr); err != nil { + return + } + if addrInTargetZone, err = dp.getAddressByZoneName(c, targetZone.name); err != nil { + return + } + //if there is no replica in target zone, choose random nodeset in target zone + if addrInTargetZone == "" { + if targetHosts, _, err = targetZone.getAvailDataNodeHosts(nil, dp.Hosts, 1); err != nil { + return + } + if len(targetHosts) == 0 { + err = fmt.Errorf("no available space to find a target address") + return + } + newAddr = targetHosts[0] + return + } + //if there is a replica in target zone, choose the same nodeset with this replica + var targetNode *DataNode + if targetNode, err = c.dataNode(addrInTargetZone); err != nil { + return + } + if nodesetInTargetZone, err = targetZone.getNodeSet(targetNode.NodeSetID); err != nil { + return + } + if targetHosts, _, err = nodesetInTargetZone.getAvailDataNodeHosts(dp.Hosts, 1); err != nil { + // select data nodes from the other node set in same zone + excludeNodeSets = append(excludeNodeSets, nodesetInTargetZone.ID) + if targetHosts, _, err = targetZone.getAvailDataNodeHosts(excludeNodeSets, dp.Hosts, 1); err != nil { + return + } + } + if len(targetHosts) == 0 { + err = fmt.Errorf("no available space to find a target address") + return + } + newAddr = targetHosts[0] + log.LogInfof("action[balanceZone],data partitionID:%v,zone name:[%v],old address:[%v], new address:[%v]", + dp.PartitionID, zoneName, oldAddr, newAddr) + return +} + +// +func (partition *DataPartition) getOfflineAndTargetZone(c *Cluster, zoneName string) (offlineZone, targetZone string, err error) { + zoneList := strings.Split(zoneName, ",") + var currentZoneList []string + switch len(zoneList) { + case 1: + zoneList = append(make([]string, 0), zoneList[0], zoneList[0], zoneList[0]) + case 2: + switch partition.PartitionID % 2 { + case 0: + zoneList = append(make([]string, 0), 
zoneList[0], zoneList[0], zoneList[1]) + default: + zoneList = append(make([]string, 0), zoneList[1], zoneList[1], zoneList[0]) + } + log.LogInfof("action[getSourceAndTargetZone],data partitionID:%v,zone name:[%v],chosen zoneList:%v", + partition.PartitionID, zoneName, zoneList) + case 3: + index := partition.PartitionID % 6 + switch partition.PartitionID%6 < 3 { + case true: + zoneList = append(make([]string, 0), zoneList[index], zoneList[index], zoneList[(index+1)%3]) + default: + zoneList = append(make([]string, 0), zoneList[(index+1)%3], zoneList[(index+1)%3], zoneList[index%3]) + } + log.LogInfof("action[getSourceAndTargetZone],data partitionID:%v,zone name:[%v],chosen zoneList:%v", + partition.PartitionID, zoneName, zoneList) + default: + err = fmt.Errorf("partition zone num must be 1, 2 or 3") + return + } + + if currentZoneList, err = partition.getZoneList(c); err != nil { + return + } + intersect := util.Intersect(zoneList, currentZoneList) + projectiveToZoneList := util.Projective(zoneList, intersect) + projectiveToCurZoneList := util.Projective(currentZoneList, intersect) + log.LogInfof("Current replica zoneList:%v, volume zoneName:%v ", currentZoneList, zoneList) + if len(projectiveToZoneList) == 0 || len(projectiveToCurZoneList) == 0 { + err = fmt.Errorf("action[getSourceAndTargetZone], Current replica zoneList:%v is consistent with the volume zoneName:%v, do not need to balance", currentZoneList, zoneList) + return + } + offlineZone = projectiveToCurZoneList[0] + targetZone = projectiveToZoneList[0] + return +} + +func (partition *DataPartition) getAddressByZoneName(c *Cluster, zone string) (addr string, err error) { + for _, host := range partition.Hosts { + var dataNode *DataNode + var z *Zone + if dataNode, err = c.dataNode(host); err != nil { + return + } + if z, err = c.t.getZoneByDataNode(dataNode); err != nil { + return + } + if zone == z.name { + addr = host + } + } + return +} + +func (partition *DataPartition) getZoneList(c *Cluster) (zoneList []string, err error) { + zoneList = make([]string, 0) + for _, host := range partition.Hosts { + var dataNode *DataNode + var zone *Zone + if dataNode, err = c.dataNode(host); err != nil { + return + } + if zone, err = c.t.getZoneByDataNode(dataNode); err != nil { + return + } + zoneList = append(zoneList, zone.name) + } + return +} + +func (partition *DataPartition) getDataZoneMap(c *Cluster) (curZonesMap map[string]uint8, err error) { + curZonesMap = make(map[string]uint8, 0) + for _, host := range partition.Hosts { + var dataNode *DataNode + var zone *Zone + if dataNode, err = c.dataNode(host); err != nil { + return + } + if zone, err = c.t.getZoneByDataNode(dataNode); err != nil { + return + } + if _, ok := curZonesMap[zone.name]; !ok { + curZonesMap[zone.name] = 1 + } else { + curZonesMap[zone.name] = curZonesMap[zone.name] + 1 + } + } + return +} diff --git a/master/data_partition_check.go b/master/data_partition_check.go index 5aaa20a78f..97a7311268 100644 --- a/master/data_partition_check.go +++ b/master/data_partition_check.go @@ -180,17 +180,17 @@ func (partition *DataPartition) checkDiskError(clusterID, leaderAddr string) { return } -func (partition *DataPartition) checkReplicationTask(clusterID string, dataPartitionSize uint64) (tasks []*proto.AdminTask) { +func (partition *DataPartition) checkReplicationTask(c *Cluster, dataPartitionSize uint64) { var msg string - tasks = make([]*proto.AdminTask, 0) if excessAddr, excessErr := partition.deleteIllegalReplica(); excessErr != nil { msg = fmt.Sprintf("action[%v], 
partitionID:%v Excess Replication"+ " On :%v Err:%v rocksDBRecords:%v", deleteIllegalReplicaErr, partition.PartitionID, excessAddr, excessErr.Error(), partition.Hosts) - Warn(clusterID, msg) - partition.Lock() - partition.removeReplicaByAddr(excessAddr) - partition.Unlock() + Warn(c.Name, msg) + dn, _ := c.dataNode(excessAddr) + if dn != nil { + c.deleteDataReplica(partition, dn, false) + } } if partition.Status == proto.ReadWrite { return @@ -199,7 +199,7 @@ func (partition *DataPartition) checkReplicationTask(clusterID string, dataParti msg = fmt.Sprintf("action[%v], partitionID:%v Lack Replication"+ " On :%v Err:%v Hosts:%v new task to create DataReplica", addMissingReplicaErr, partition.PartitionID, lackAddr, lackErr.Error(), partition.Hosts) - Warn(clusterID, msg) + Warn(c.Name, msg) } else { partition.setToNormal() } diff --git a/master/data_partition_test.go b/master/data_partition_test.go index e157d3a019..ab3c2b73e1 100644 --- a/master/data_partition_test.go +++ b/master/data_partition_test.go @@ -24,6 +24,8 @@ func TestDataPartition(t *testing.T) { getDataPartition(partition.PartitionID, t) loadDataPartitionTest(partition, t) decommissionDataPartition(partition, t) + partition2 := commonVol.dataPartitions.partitions[1] + delDataReplicaTest(partition2, t) } func createDataPartition(vol *Vol, count int, t *testing.T) { @@ -89,3 +91,32 @@ func loadDataPartitionTest(dp *DataPartition, t *testing.T) { dp.validateCRC(server.cluster.Name) dp.setToNormal() } +func delDataReplicaTest(dp *DataPartition, t *testing.T) { + t.Logf("dpID[%v],hosts[%v],replica length[%v]", dp.PartitionID, dp.Hosts, len(dp.Replicas)) + testAddr := mds9Addr + extraReplica := proto.DataReplica{ + Status: 2, + Addr: testAddr, + } + addDataServer(testAddr, testZone1) + dn, _ := server.cluster.dataNode(testAddr) + extraDataReplica := &DataReplica{ + DataReplica: extraReplica, + dataNode: dn, + } + dp.Replicas = append(dp.Replicas, extraDataReplica) + err := server.cluster.deleteDataReplica(dp, dn, false) + if err != nil { + t.Errorf("delete replica failed, err[%v]", err) + } + server.cluster.checkDataPartitions() + if len(dp.Replicas) != 3 { + t.Errorf("delete replica failed, expect replica length[%v], but is[%v]", 3, len(dp.Replicas)) + } + for _, r := range dp.Replicas { + if testAddr == r.Addr { + t.Errorf("delete replica [%v] failed", testAddr) + return + } + } +} diff --git a/master/disk_manager.go b/master/disk_manager.go index 9bb71ca471..e3fc313cde 100644 --- a/master/disk_manager.go +++ b/master/disk_manager.go @@ -16,8 +16,8 @@ package master import ( "fmt" - "github.com/chubaofs/chubaofs/util" "github.com/chubaofs/chubaofs/util/log" + "sync" "time" ) @@ -27,6 +27,7 @@ func (c *Cluster) scheduleToCheckDiskRecoveryProgress() { if c.partition != nil && c.partition.IsRaftLeader() { if c.vols != nil { c.checkDiskRecoveryProgress() + c.checkMigratedDataPartitionsRecoveryProgress() } } time.Sleep(time.Second * defaultIntervalToCheckDataPartition) @@ -42,7 +43,6 @@ func (c *Cluster) checkDiskRecoveryProgress() { "checkDiskRecoveryProgress occurred panic") } }() - var diff float64 c.BadDataPartitionIds.Range(func(key, value interface{}) bool { badDataPartitionIds := value.([]uint64) newBadDpIds := make([]uint64, 0) @@ -58,20 +58,19 @@ func (c *Cluster) checkDiskRecoveryProgress() { if len(partition.Replicas) == 0 || len(partition.Replicas) < int(vol.dpReplicaNum) { continue } - diff = partition.getMinus() - if diff < util.GB { + if partition.isDataCatchUp() { partition.isRecover = false partition.RLock() 
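// Reviewer note on the decommissionDisk hunk below: inside the goroutine the
// send is guarded by `err != nil` (the outer captured variable) while the
// value sent is err1; this looks like it was meant to be `err1 != nil`,
// otherwise per-partition decommission failures are silently dropped.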
c.syncUpdateDataPartition(partition) partition.RUnlock() - Warn(c.Name, fmt.Sprintf("clusterID[%v],partitionID[%v] has recovered success", c.Name, partitionID)) + Warn(c.Name, fmt.Sprintf("action[checkDiskRecoveryProgress] clusterID[%v],partitionID[%v] has recovered success", c.Name, partitionID)) } else { newBadDpIds = append(newBadDpIds, partitionID) } } if len(newBadDpIds) == 0 { - Warn(c.Name, fmt.Sprintf("clusterID[%v],node:disk[%v] has recovered success", c.Name, key)) + Warn(c.Name, fmt.Sprintf("action[checkDiskRecoveryProgress] clusterID[%v],node:disk[%v] has recovered success", c.Name, key)) c.BadDataPartitionIds.Delete(key) } else { c.BadDataPartitionIds.Store(key, newBadDpIds) @@ -84,11 +83,25 @@ func (c *Cluster) checkDiskRecoveryProgress() { func (c *Cluster) decommissionDisk(dataNode *DataNode, badDiskPath string, badPartitions []*DataPartition) (err error) { msg := fmt.Sprintf("action[decommissionDisk], Node[%v] OffLine,disk[%v]", dataNode.Addr, badDiskPath) log.LogWarn(msg) - + var wg sync.WaitGroup + errChannel := make(chan error, len(badPartitions)) + defer func() { + close(errChannel) + }() for _, dp := range badPartitions { - if err = c.decommissionDataPartition(dataNode.Addr, dp, diskOfflineErr); err != nil { - return - } + wg.Add(1) + go func(dp *DataPartition) { + defer wg.Done() + if err1 := c.decommissionDataPartition(dataNode.Addr, dp, getTargetAddressForDataPartitionDecommission, diskOfflineErr, "", false); err != nil { + errChannel <- err1 + } + }(dp) + } + wg.Wait() + select { + case err = <-errChannel: + return + default: } msg = fmt.Sprintf("action[decommissionDisk],clusterID[%v] Node[%v] OffLine success", c.Name, dataNode.Addr) diff --git a/master/gapi_cluster.go b/master/gapi_cluster.go index fd8b29c37e..ee4a515488 100644 --- a/master/gapi_cluster.go +++ b/master/gapi_cluster.go @@ -218,7 +218,7 @@ func (m *ClusterService) decommissionDataNode(ctx context.Context, args struct { if err != nil { return nil, err } - if err := m.cluster.decommissionDataNode(node); err != nil { + if err := m.cluster.decommissionDataNode(node, "", false); err != nil { return nil, err } rstMsg := fmt.Sprintf("decommission data node [%v] successfully", args.OffLineAddr) @@ -236,7 +236,7 @@ func (m *ClusterService) decommissionMetaNode(ctx context.Context, args struct { if err != nil { return nil, err } - if err = m.cluster.decommissionMetaNode(metaNode); err != nil { + if err = m.cluster.decommissionMetaNode(metaNode, false); err != nil { return nil, err } log.LogInfof("decommissionMetaNode metaNode [%v] has offline successfully", args.OffLineAddr) @@ -270,7 +270,7 @@ func (m *ClusterService) decommissionMetaPartition(ctx context.Context, args str if err != nil { return nil, err } - if err := m.cluster.decommissionMetaPartition(args.NodeAddr, mp); err != nil { + if err := m.cluster.decommissionMetaPartition(args.NodeAddr, mp, getTargetAddressForMetaPartitionDecommission, false); err != nil { return nil, err } log.LogInfof(proto.AdminDecommissionMetaPartition+" partitionID :%v decommissionMetaPartition successfully", args.PartitionID) @@ -611,6 +611,8 @@ func (m *ClusterService) makeClusterView() *proto.ClusterView { LeaderAddr: m.cluster.leaderInfo.addr, DisableAutoAlloc: m.cluster.DisableAutoAllocate, MetaNodeThreshold: m.cluster.cfg.MetaNodeThreshold, + DpRecoverPool: m.cluster.cfg.DataPartitionsRecoverPoolSize, + MpRecoverPool: m.cluster.cfg.MetaPartitionsRecoverPoolSize, Applied: m.cluster.fsm.applied, MaxDataPartitionID: m.cluster.idAlloc.dataPartitionID, MaxMetaNodeID: 
m.cluster.idAlloc.commonID, diff --git a/master/gapi_volume.go b/master/gapi_volume.go index ee5ec613e9..fba9116698 100644 --- a/master/gapi_volume.go +++ b/master/gapi_volume.go @@ -65,8 +65,9 @@ func (s *VolumeService) registerObject(schema *schemabuilder.Schema) { FollowerRead: vol.FollowerRead, NeedToLowerReplica: vol.NeedToLowerReplica, Authenticate: vol.authenticate, - CrossZone: vol.crossZone, EnableToken: vol.enableToken, + CrossZone: vol.crossZone, + AutoRepair: vol.autoRepair, Tokens: vol.tokens, RwDpCnt: vol.dataPartitions.readableAndWritableCnt, MpCnt: len(vol.MetaPartitions), @@ -205,7 +206,7 @@ func (s *VolumeService) createVolume(ctx context.Context, args struct { return nil, fmt.Errorf("[%s] not has permission to create volume for [%s]", uid, args.Owner) } - vol, err := s.cluster.createVol(args.Name, args.Owner, args.ZoneName, args.Description, int(args.MpCount), int(args.DpReplicaNum), int(args.DataPartitionSize), int(args.Capacity), args.FollowerRead, args.Authenticate, args.CrossZone, args.EnableToken) + vol, err := s.cluster.createVol(args.Name, args.Owner, args.ZoneName, args.Description, int(args.MpCount), int(args.DpReplicaNum), int(args.DataPartitionSize), int(args.Capacity), args.FollowerRead, args.Authenticate, args.EnableToken, false) if err != nil { return nil, err } @@ -266,11 +267,11 @@ func (s *VolumeService) markDeleteVol(ctx context.Context, args struct { } func (s *VolumeService) updateVolume(ctx context.Context, args struct { - Name, AuthKey string - ZoneName, Description *string - Capacity, ReplicaNum *uint64 - EnableToken *bool - FollowerRead, Authenticate *bool + Name, AuthKey string + ZoneName, Description *string + Capacity, ReplicaNum *uint64 + EnableToken *bool + FollowerRead, Authenticate, AutoRepair *bool }) (*Vol, error) { uid, perm, err := permissions(ctx, ADMIN|USER) if err != nil { @@ -326,7 +327,15 @@ func (s *VolumeService) updateVolume(ctx context.Context, args struct { newArgs.description = *args.Description } - if err = s.cluster.updateVol(args.Name, args.AuthKey, newArgs); err != nil { + if args.AutoRepair == nil { + args.AutoRepair = &vol.autoRepair + } + + if args.Description == nil { + args.Description = &vol.description + } + + if err = s.cluster.updateVol(args.Name, args.AuthKey, *args.ZoneName, *args.Description, *args.Capacity, uint8(*args.ReplicaNum), *args.FollowerRead, *args.Authenticate, *args.EnableToken, *args.AutoRepair); err != nil { return nil, err } diff --git a/master/http_server.go b/master/http_server.go index 987dd4354d..3d8bdaf2f3 100644 --- a/master/http_server.go +++ b/master/http_server.go @@ -244,6 +244,9 @@ func (m *Server) registerAPIRoutes(router *mux.Router) { router.NewRoute().Methods(http.MethodGet, http.MethodPost). Path(proto.AdminGetNodeInfo). HandlerFunc(m.getNodeInfoHandler) + router.NewRoute().Methods(http.MethodGet, http.MethodPost). + Path(proto.AdminSetNodeState). + HandlerFunc(m.setNodeToOfflineState) // user management APIs router.NewRoute().Methods(http.MethodPost). 
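On the updateVolume resolver above: optional GraphQL arguments arrive as pointers, nil meaning "not supplied", and the patch backfills AutoRepair and Description from the current volume before dereferencing. It is worth double-checking that every pointer passed to updateVol gets the same treatment. A generic helper in that spirit (hypothetical, not part of the patch):

package main

import "fmt"

// orDefault returns the argument's value when it was supplied, otherwise the
// current setting; nil means the field was absent from the mutation.
func orDefault(arg *bool, current bool) bool {
	if arg == nil {
		return current
	}
	return *arg
}

func main() {
	current := true // e.g. vol.autoRepair
	var notSent *bool
	sent := false
	fmt.Println(orDefault(notSent, current)) // true  (keep current value)
	fmt.Println(orDefault(&sent, current))   // false (caller override)
}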
diff --git a/master/meta_node.go b/master/meta_node.go
index 1661285526..d185b46f85 100644
--- a/master/meta_node.go
+++ b/master/meta_node.go
@@ -40,8 +40,9 @@ type MetaNode struct {
 	metaPartitionInfos        []*proto.MetaPartitionReport
 	MetaPartitionCount        int
 	NodeSetID                 uint64
-	sync.RWMutex `graphql:"-"`
+	sync.RWMutex              `graphql:"-"`
 	ToBeOffline               bool
+	ToBeMigrated              bool
 	PersistenceMetaPartitions []uint64
 }
 
@@ -89,7 +90,8 @@ func (metaNode *MetaNode) isWritable() (ok bool) {
 	metaNode.RLock()
 	defer metaNode.RUnlock()
 	if metaNode.IsActive && metaNode.MaxMemAvailWeight > gConfig.metaNodeReservedMem &&
-		!metaNode.reachesThreshold() && metaNode.MetaPartitionCount < defaultMaxMetaPartitionCountOnEachNode {
+		!metaNode.reachesThreshold() && metaNode.MetaPartitionCount < defaultMaxMetaPartitionCountOnEachNode &&
+		!metaNode.ToBeOffline && !metaNode.ToBeMigrated {
 		ok = true
 	}
 	return
diff --git a/master/meta_partition.go b/master/meta_partition.go
index d1da322491..8cc7e3adc9 100644
--- a/master/meta_partition.go
+++ b/master/meta_partition.go
@@ -15,6 +15,7 @@
 package master
 
 import (
+	"github.com/chubaofs/chubaofs/util"
 	"sync"
 
 	"fmt"
@@ -57,8 +58,8 @@ type MetaPartition struct {
 	volName       string
 	Hosts         []string
 	Peers         []proto.Peer
-	OfflinePeerID uint64
 	MissNodes     map[string]int64
+	OfflinePeerID uint64
 	LoadResponse  []*proto.MetaPartitionLoadResponse
 	offlineMutex  sync.RWMutex
 	sync.RWMutex
@@ -185,13 +186,15 @@ func (mp *MetaPartition) checkEnd(c *Cluster, maxPartitionID uint64) {
 		log.LogWarnf("action[checkEnd] vol[%v] not exist", mp.volName)
 		return
 	}
-	mp.Lock()
-	defer mp.Unlock()
+	vol.createMpMutex.RLock()
+	defer vol.createMpMutex.RUnlock()
 	curMaxPartitionID := vol.maxPartitionID()
 	if mp.PartitionID != curMaxPartitionID {
 		log.LogWarnf("action[checkEnd] partition[%v] not max partition[%v]", mp.PartitionID, curMaxPartitionID)
 		return
 	}
+	mp.Lock()
+	defer mp.Unlock()
 	if _, err = mp.getMetaReplicaLeader(); err != nil {
 		log.LogWarnf("action[checkEnd] partition[%v] no leader", mp.PartitionID)
 		return
@@ -299,12 +302,12 @@ func (mp *MetaPartition) checkReplicaNum(c *Cluster, volName string, replicaNum
 	}
 }
 
-func (mp *MetaPartition) removeIllegalReplica() (excessAddr string, t *proto.AdminTask, err error) {
-	mp.RLock()
-	defer mp.RUnlock()
+func (mp *MetaPartition) removeIllegalReplica() (excessAddr string, err error) {
+	mp.Lock()
+	defer mp.Unlock()
 	for _, mr := range mp.Replicas {
 		if !contains(mp.Hosts, mr.Addr) {
-			t = mr.createTaskToDeleteReplica(mp.PartitionID)
+			excessAddr = mr.Addr
 			err = proto.ErrIllegalMetaReplica
 			break
 		}
@@ -362,9 +365,20 @@ func (mp *MetaPartition) canBeOffline(nodeAddr string, replicaNum int) (err erro
 }
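With the isWritable change above, a meta node flagged ToBeOffline or ToBeMigrated stops being a placement target even while it is healthy and has free memory. A condensed sketch of the predicate, written as a free function with illustrative parameter names rather than the project's method:

```go
package main

import "fmt"

// writable mirrors the amended condition: every health check must hold AND
// the node must not be scheduled for offline or migration.
func writable(active bool, availMem, reservedMem uint64, reachedThreshold bool,
	mpCount, maxMpCount int, toBeOffline, toBeMigrated bool) bool {
	return active &&
		availMem > reservedMem &&
		!reachedThreshold &&
		mpCount < maxMpCount &&
		!toBeOffline && !toBeMigrated
}

func main() {
	// A healthy node that is being migrated is no longer a placement target.
	fmt.Println(writable(true, 8<<30, 1<<30, false, 100, 10000, false, true)) // false
}
```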
 // Check if there is a replica missing or not.
-func (mp *MetaPartition) hasMissingOneReplica(replicaNum int) (err error) {
-	hostNum := len(mp.Replicas)
-	if hostNum <= replicaNum-1 {
+func (mp *MetaPartition) hasMissingOneReplica(offlineAddr string, replicaNum int) (err error) {
+	curHostCount := len(mp.Hosts)
+	for _, host := range mp.Hosts {
+		if host == offlineAddr {
+			curHostCount = curHostCount - 1
+		}
+	}
+	curReplicaCount := len(mp.Replicas)
+	for _, r := range mp.Replicas {
+		if r.Addr == offlineAddr {
+			curReplicaCount = curReplicaCount - 1
+		}
+	}
+	if curHostCount < replicaNum-1 || curReplicaCount < replicaNum-1 {
 		log.LogError(fmt.Sprintf("action[%v],partitionID:%v,err:%v",
 			"hasMissingOneReplica", mp.PartitionID, proto.ErrHasOneMissingReplica))
 		err = proto.ErrHasOneMissingReplica
@@ -470,20 +484,22 @@ func (mp *MetaPartition) reportMissingReplicas(clusterID, leaderAddr string, sec
 	}
 }
 
-func (mp *MetaPartition) replicaCreationTasks(clusterID, volName string) (tasks []*proto.AdminTask) {
+func (mp *MetaPartition) replicaCreationTasks(c *Cluster, volName string) {
 	var msg string
-	tasks = make([]*proto.AdminTask, 0)
-	if addr, _, err := mp.removeIllegalReplica(); err != nil {
+	mp.offlineMutex.Lock()
+	defer mp.offlineMutex.Unlock()
+	if addr, err := mp.removeIllegalReplica(); err != nil {
 		msg = fmt.Sprintf("action[%v],clusterID[%v] metaPartition:%v excess replication"+
 			" on :%v err:%v persistenceHosts:%v",
-			deleteIllegalReplicaErr, clusterID, mp.PartitionID, addr, err.Error(), mp.Hosts)
+			deleteIllegalReplicaErr, c.Name, mp.PartitionID, addr, err.Error(), mp.Hosts)
 		log.LogWarn(msg)
+		c.deleteMetaReplica(mp, addr, true, false)
 	}
 	if addrs := mp.missingReplicaAddrs(); addrs != nil {
 		msg = fmt.Sprintf("action[missingReplicaAddrs],clusterID[%v] metaPartition:%v lack replication"+
 			" on :%v Hosts:%v",
-			clusterID, mp.PartitionID, addrs, mp.Hosts)
-		Warn(clusterID, msg)
+			c.Name, mp.PartitionID, addrs, mp.Hosts)
+		Warn(c.Name, msg)
 	}
 
 	return
@@ -670,6 +686,81 @@ func (mp *MetaPartition) getMinusOfMaxInodeID() (minus float64) {
 	return
 }
 
+func (mp *MetaPartition) getPercentMinusOfInodeCount() (minus float64) {
+	mp.RLock()
+	defer mp.RUnlock()
+	var sentry float64
+	for index, replica := range mp.Replicas {
+		if index == 0 {
+			sentry = float64(replica.InodeCount)
+			continue
+		}
+		diff := math.Abs(float64(replica.InodeCount) - sentry)
+		if diff > minus {
+			minus = diff
+		}
+	}
+	// guard the normalization: with no replicas, or a zero inode count on
+	// the first replica, dividing would yield NaN
+	if sentry != 0 {
+		minus = minus / sentry
+	}
+	return
+}
+
+func (mp *MetaPartition) getMinusOfInodeCount() (minus float64) {
+	mp.RLock()
+	defer mp.RUnlock()
+	var sentry float64
+	for index, replica := range mp.Replicas {
+		if index == 0 {
+			sentry = float64(replica.InodeCount)
+			continue
+		}
+		diff := math.Abs(float64(replica.InodeCount) - sentry)
+		if diff > minus {
+			minus = diff
+		}
+	}
+	return
+}
+
+func (mp *MetaPartition) getMinusOfDentryCount() (minus float64) {
+	mp.RLock()
+	defer mp.RUnlock()
+	if len(mp.Replicas) == 0 {
+		return 1
+	}
+	var sentry float64
+	for index, replica := range mp.Replicas {
+		if index == 0 {
+			sentry = float64(replica.DentryCount)
+			continue
+		}
+		diff := math.Abs(float64(replica.DentryCount) - sentry)
+		if diff > minus {
+			minus = diff
+		}
+	}
+	return
+}
+
+func (mp *MetaPartition) getMinusOfApplyID() (minus float64) {
+	mp.RLock()
+	defer mp.RUnlock()
+	if len(mp.LoadResponse) == 0 {
+		return 1
+	}
+	var sentry float64
+	for index, resp := range mp.LoadResponse {
+		if index == 0 {
+			sentry = float64(resp.ApplyID)
+			continue
+		}
+		diff := math.Abs(float64(resp.ApplyID) - sentry)
+		if diff > minus {
+			minus = diff
+		}
+	}
+	return
+}
+
 func (mp *MetaPartition) setMaxInodeID() {
 	var maxUsed uint64
 	for _, r := range mp.Replicas {
@@ -729,3 +820,250 @@ func (mp *MetaPartition) getLiveZones(offlineAddr string) (zones []string) {
 	}
 	return
 }
+
+func (mp *MetaPartition) isLatestReplica(addr string) (ok bool) {
+	hostsLen := len(mp.Hosts)
+	if hostsLen <= 1 {
+		return
+	}
+	latestAddr := mp.Hosts[hostsLen-1]
+	return latestAddr == addr
+}
+
+func (mp *MetaPartition) RepairZone(vol *Vol, c *Cluster) (err error) {
+	var (
+		zoneList        []string
+		isNeedRebalance bool
+	)
+	mp.RLock()
+	defer mp.RUnlock()
+	var isValidZone bool
+	if isValidZone, err = c.isValidZone(vol.zoneName); err != nil {
+		return
+	}
+	if !isValidZone {
+		log.LogWarnf("action[RepairZone], vol[%v], zoneName[%v], mpReplicaNum[%v] can not be automatically repaired", vol.Name, vol.zoneName, vol.mpReplicaNum)
+		return
+	}
+	zoneList = strings.Split(vol.zoneName, ",")
+	if len(mp.Replicas) != int(vol.mpReplicaNum) {
+		log.LogWarnf("action[RepairZone], meta replica length[%v] not equal to mpReplicaNum[%v]", len(mp.Replicas), vol.mpReplicaNum)
+		return
+	}
+	if mp.IsRecover {
+		log.LogWarnf("action[RepairZone], meta partition[%v] is recovering", mp.PartitionID)
+		return
+	}
+
+	var mpInRecover uint64
+	mpInRecover = uint64(c.metaPartitionInRecovering())
+	if int32(mpInRecover) > c.cfg.MetaPartitionsRecoverPoolSize {
+		log.LogWarnf("action[RepairZone] clusterID[%v] recover pool is full, partitions in recovery[%v], pool size[%v]", c.Name, mpInRecover, c.cfg.MetaPartitionsRecoverPoolSize)
+		return
+	}
+	rps := mp.getLiveReplicas()
+	if len(rps) < int(vol.mpReplicaNum) {
+		log.LogWarnf("action[RepairZone], vol[%v], zoneName[%v], live Replicas [%v] less than mpReplicaNum[%v], can not be automatically repaired", vol.Name, vol.zoneName, len(rps), vol.mpReplicaNum)
+		return
+	}
+
+	if isNeedRebalance, err = mp.needToRebalanceZone(c, zoneList); err != nil {
+		return
+	}
+	if !isNeedRebalance {
+		return
+	}
+
+	if err = c.sendRepairMetaPartitionTask(mp, BalanceMetaZone); err != nil {
+		log.LogErrorf("action[RepairZone] clusterID[%v] vol[%v] meta partition[%v] err[%v]", c.Name, vol.Name, mp.PartitionID, err)
+		return
+	}
+	return
+}
+
+var getTargetAddressForRepairMetaZone = func(c *Cluster, nodeAddr string, mp *MetaPartition, oldHosts []string, excludeNodeSets []uint64, zoneName string) (oldAddr, addAddr string, err error) {
+	var (
+		offlineZoneName     string
+		targetZoneName      string
+		addrInTargetZone    string
+		targetZone          *Zone
+		nodesetInTargetZone *nodeSet
+		targetHosts         []string
+	)
+	if offlineZoneName, targetZoneName, err = mp.getOfflineAndTargetZone(c, zoneName); err != nil {
+		return
+	}
+	if offlineZoneName == "" || targetZoneName == "" {
+		return
+	}
+	if targetZone, err = c.t.getZone(targetZoneName); err != nil {
+		return
+	}
+	if oldAddr, err = mp.getAddressByZoneName(c, offlineZoneName); err != nil {
+		return
+	}
+	if oldAddr == "" {
+		err = fmt.Errorf("can not find address to decommission")
+		return
+	}
+	if err = c.validateDecommissionMetaPartition(mp, oldAddr); err != nil {
+		return
+	}
+	if addrInTargetZone, err = mp.getAddressByZoneName(c, targetZone.name); err != nil {
+		return
+	}
+	//if there is no replica in the target zone, choose a random node set in the target zone
+	if addrInTargetZone == "" {
+		if targetHosts, _, err = targetZone.getAvailMetaNodeHosts(nil, mp.Hosts, 1); err != nil {
+			return
+		}
+		if len(targetHosts) == 0 {
+			err = fmt.Errorf("no available space to find a target address")
+			return
+		}
+		addAddr = targetHosts[0]
+		return
+	}
+	var targetNode *MetaNode
+	//if there is a replica in the target zone, choose the same node set as this replica
+	if targetNode, err = c.metaNode(addrInTargetZone); err != nil {
+		err = fmt.Errorf("action[getTargetAddressForRepairMetaZone] partitionID[%v], addr[%v] metaNode not exist", mp.PartitionID, addrInTargetZone)
+		return
+	}
+	if nodesetInTargetZone, err = targetZone.getNodeSet(targetNode.NodeSetID); err != nil {
+		return
+	}
+	if targetHosts, _, err = nodesetInTargetZone.getAvailMetaNodeHosts(mp.Hosts, 1); err != nil {
+		// select meta nodes from the other node sets in the same zone
+		excludeNodeSets = append(excludeNodeSets, nodesetInTargetZone.ID)
+		if targetHosts, _, err = targetZone.getAvailMetaNodeHosts(excludeNodeSets, mp.Hosts, 1); err != nil {
+			return
+		}
+	}
+	if len(targetHosts) == 0 {
+		err = fmt.Errorf("no available space to find a target address")
+		return
+	}
+	addAddr = targetHosts[0]
+	log.LogInfof("action[getTargetAddressForRepairMetaZone],meta partitionID:%v,zone name:[%v],old address:[%v], new address:[%v]",
+		mp.PartitionID, zoneName, oldAddr, addAddr)
+	return
+}
+
+//check whether the meta partition needs zone rebalancing
+func (mp *MetaPartition) needToRebalanceZone(c *Cluster, zoneList []string) (isNeed bool, err error) {
+	var curZoneMap map[string]uint8
+	var curZoneList []string
+	curZoneMap = make(map[string]uint8, 0)
+	curZoneList = make([]string, 0)
+	if curZoneMap, err = mp.getMetaZoneMap(c); err != nil {
+		return
+	}
+	for k := range curZoneMap {
+		curZoneList = append(curZoneList, k)
+	}
+
+	log.LogInfof("action[needToRebalanceZone],meta partitionID:%v,zone name:%v,current zones[%v]",
+		mp.PartitionID, zoneList, curZoneList)
+	if len(curZoneMap) == len(zoneList) {
+		isNeed = false
+		for _, zone := range zoneList {
+			if _, ok := curZoneMap[zone]; !ok {
+				isNeed = true
+			}
+		}
+		return
+	}
+	isNeed = true
+	return
+}
+
+func (mp *MetaPartition) getOfflineAndTargetZone(c *Cluster, volZoneName string) (offlineZone, targetZone string, err error) {
+	zoneList := strings.Split(volZoneName, ",")
+	switch len(zoneList) {
+	case 1:
+		zoneList = append(make([]string, 0), zoneList[0], zoneList[0], zoneList[0])
+	case 2:
+		switch mp.PartitionID % 2 {
+		case 0:
+			zoneList = append(make([]string, 0), zoneList[0], zoneList[0], zoneList[1])
+		default:
+			zoneList = append(make([]string, 0), zoneList[1], zoneList[1], zoneList[0])
+		}
+		log.LogInfof("action[getOfflineAndTargetZone],meta partitionID:%v,zone name:[%v],chosen zoneList:%v",
+			mp.PartitionID, volZoneName, zoneList)
+	case 3:
+		log.LogInfof("action[getOfflineAndTargetZone],meta partitionID:%v,zone name:[%v],chosen zoneList:%v",
+			mp.PartitionID, volZoneName, zoneList)
+	default:
+		err = fmt.Errorf("partition zone num must be 1, 2 or 3")
+		return
+	}
+	var currentZoneList []string
+	if currentZoneList, err = mp.getZoneList(c); err != nil {
+		return
+	}
+	intersect := util.Intersect(zoneList, currentZoneList)
+	projectiveToZoneList := util.Projective(zoneList, intersect)
+	projectiveToCurZoneList := util.Projective(currentZoneList, intersect)
+	log.LogInfof("Current replica zoneList:%v, volume zoneName:%v ", currentZoneList, zoneList)
+	if len(projectiveToZoneList) == 0 || len(projectiveToCurZoneList) == 0 {
+		err = fmt.Errorf("action[getOfflineAndTargetZone], current replica zoneList:%v is consistent with the volume zoneName:%v, no rebalance needed", currentZoneList, zoneList)
+		return
+	}
+	offlineZone = projectiveToCurZoneList[0]
+	targetZone = projectiveToZoneList[0]
+	return
+}
+
+func (mp *MetaPartition) getAddressByZoneName(c *Cluster, zone
string) (addr string, err error) { + for _, host := range mp.Hosts { + var metaNode *MetaNode + var z *Zone + if metaNode, err = c.metaNode(host); err != nil { + return + } + if z, err = c.t.getZoneByMetaNode(metaNode); err != nil { + return + } + if zone == z.name { + addr = host + } + } + return +} + +func (mp *MetaPartition) getZoneList(c *Cluster) (zoneList []string, err error) { + zoneList = make([]string, 0) + for _, host := range mp.Hosts { + var metaNode *MetaNode + var zone *Zone + if metaNode, err = c.metaNode(host); err != nil { + return + } + if zone, err = c.t.getZoneByMetaNode(metaNode); err != nil { + return + } + zoneList = append(zoneList, zone.name) + } + return +} + +func (mp *MetaPartition) getMetaZoneMap(c *Cluster) (curZonesMap map[string]uint8, err error) { + curZonesMap = make(map[string]uint8, 0) + for _, host := range mp.Hosts { + var metaNode *MetaNode + var zone *Zone + if metaNode, err = c.metaNode(host); err != nil { + return + } + if zone, err = c.t.getZoneByMetaNode(metaNode); err != nil { + return + } + if _, ok := curZonesMap[zone.name]; !ok { + curZonesMap[zone.name] = 1 + } else { + curZonesMap[zone.name] = curZonesMap[zone.name] + 1 + } + } + return +} diff --git a/master/meta_partition_manager.go b/master/meta_partition_manager.go index b98d6a2920..16560dc75c 100644 --- a/master/meta_partition_manager.go +++ b/master/meta_partition_manager.go @@ -134,9 +134,10 @@ func (c *Cluster) scheduleToCheckMetaPartitionRecoveryProgress() { if c.partition != nil && c.partition.IsRaftLeader() { if c.vols != nil { c.checkMetaPartitionRecoveryProgress() + c.checkMigratedMetaPartitionRecoveryProgress() } } - time.Sleep(time.Second * defaultIntervalToCheckDataPartition) + time.Sleep(3 * time.Second * defaultIntervalToCheckDataPartition) } }() } @@ -179,7 +180,7 @@ func (c *Cluster) checkMetaPartitionRecoveryProgress() { } if len(newBadMpIds) == 0 { - Warn(c.Name, fmt.Sprintf("clusterID[%v],node[%v] has recovered success", c.Name, key)) + Warn(c.Name, fmt.Sprintf("action[checkMetaPartitionRecoveryProgress] clusterID[%v],node[%v] has recovered success", c.Name, key)) c.BadMetaPartitionIds.Delete(key) } else { c.BadMetaPartitionIds.Store(key, newBadMpIds) diff --git a/master/metadata_fsm.go b/master/metadata_fsm.go index d42771ef85..90ea2ba32c 100644 --- a/master/metadata_fsm.go +++ b/master/metadata_fsm.go @@ -77,7 +77,9 @@ func (mf *MetadataFsm) restore() { } func (mf *MetadataFsm) restoreApplied() { - + defer func() { + log.LogInfof("action[restoreApplied],applyID[%v]", mf.applied) + }() value, err := mf.store.Get(applied) if err != nil { panic(fmt.Sprintf("Failed to restore applied err:%v", err.Error())) diff --git a/master/metadata_fsm_op.go b/master/metadata_fsm_op.go index 9506ff1b49..2cc2224dd8 100644 --- a/master/metadata_fsm_op.go +++ b/master/metadata_fsm_op.go @@ -32,24 +32,28 @@ import ( transferred over the network. 
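getOfflineAndTargetZone above leans on util.Intersect and util.Projective, whose definitions are not part of this patch. From the call sites they appear to behave like multiset intersection and difference over zone-name slices, since a volume's zone list may repeat a zone (e.g. ["z1","z1","z2"]). A standalone sketch under that assumption; the names are illustrative equivalents, not the project's util API:

```go
package main

import "fmt"

// intersect keeps the elements of a that also occur in b, respecting
// multiplicity (assumed semantics of util.Intersect).
func intersect(a, b []string) (out []string) {
	counts := make(map[string]int, len(b))
	for _, s := range b {
		counts[s]++
	}
	for _, s := range a {
		if counts[s] > 0 {
			counts[s]--
			out = append(out, s)
		}
	}
	return
}

// projective returns what is left of a after removing drop once each,
// respecting multiplicity (assumed semantics of util.Projective).
func projective(a, drop []string) (out []string) {
	counts := make(map[string]int, len(drop))
	for _, s := range drop {
		counts[s]++
	}
	for _, s := range a {
		if counts[s] > 0 {
			counts[s]--
			continue
		}
		out = append(out, s)
	}
	return
}

func main() {
	want := []string{"z1", "z1", "z2"}    // zones the volume asks for
	have := []string{"z1", "z2", "z2"}    // zones the replicas currently sit in
	common := intersect(want, have)       // [z1 z2]
	fmt.Println(projective(have, common)) // [z2] -> offline candidate
	fmt.Println(projective(want, common)) // [z1] -> target zone
}
```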
*/ type clusterValue struct { - Name string - Threshold float32 - DisableAutoAllocate bool - DataNodeDeleteLimitRate uint64 - MetaNodeDeleteBatchCount uint64 - MetaNodeDeleteWorkerSleepMs uint64 DataNodeAutoRepairLimitRate uint64 + Name string + Threshold float32 + DisableAutoAllocate bool + DataNodeDeleteLimitRate uint64 + MetaNodeDeleteBatchCount uint64 + MetaNodeDeleteWorkerSleepMs uint64 + PoolSizeOfDataPartitionsInRecover int32 + PoolSizeOfMetaPartitionsInRecover int32 } func newClusterValue(c *Cluster) (cv *clusterValue) { cv = &clusterValue{ - Name: c.Name, - Threshold: c.cfg.MetaNodeThreshold, - DataNodeDeleteLimitRate: c.cfg.DataNodeDeleteLimitRate, - MetaNodeDeleteBatchCount: c.cfg.MetaNodeDeleteBatchCount, - MetaNodeDeleteWorkerSleepMs: c.cfg.MetaNodeDeleteWorkerSleepMs, DataNodeAutoRepairLimitRate: c.cfg.DataNodeAutoRepairLimitRate, - DisableAutoAllocate: c.DisableAutoAllocate, + Name: c.Name, + Threshold: c.cfg.MetaNodeThreshold, + DataNodeDeleteLimitRate: c.cfg.DataNodeDeleteLimitRate, + MetaNodeDeleteBatchCount: c.cfg.MetaNodeDeleteBatchCount, + MetaNodeDeleteWorkerSleepMs: c.cfg.MetaNodeDeleteWorkerSleepMs, + DisableAutoAllocate: c.DisableAutoAllocate, + PoolSizeOfDataPartitionsInRecover: c.cfg.DataPartitionsRecoverPoolSize, + PoolSizeOfMetaPartitionsInRecover: c.cfg.MetaPartitionsRecoverPoolSize, } return cv } @@ -134,8 +138,9 @@ type volValue struct { Owner string FollowerRead bool Authenticate bool - CrossZone bool EnableToken bool + CrossZone bool + AutoRepair bool ZoneName string OSSAccessKey string OSSSecretKey string @@ -162,8 +167,9 @@ func newVolValue(vol *Vol) (vv *volValue) { Owner: vol.Owner, FollowerRead: vol.FollowerRead, Authenticate: vol.authenticate, - CrossZone: vol.crossZone, + AutoRepair: vol.autoRepair, ZoneName: vol.zoneName, + CrossZone: vol.crossZone, EnableToken: vol.enableToken, OSSAccessKey: vol.OSSAccessKey, OSSSecretKey: vol.OSSSecretKey, @@ -527,7 +533,16 @@ func (c *Cluster) updateMetaNodeDeleteBatchCount(val uint64) { func (c *Cluster) updateMetaNodeDeleteWorkerSleepMs(val uint64) { atomic.StoreUint64(&c.cfg.MetaNodeDeleteWorkerSleepMs, val) } - +func (c *Cluster) updateRecoverPoolSize(dpPoolSize, mpPoolSize int32) { + if dpPoolSize == 0 { + dpPoolSize = defaultRecoverPoolSize + } + if mpPoolSize == 0 { + mpPoolSize = defaultRecoverPoolSize + } + atomic.StoreInt32(&c.cfg.DataPartitionsRecoverPoolSize, dpPoolSize) + atomic.StoreInt32(&c.cfg.MetaPartitionsRecoverPoolSize, mpPoolSize) +} func (c *Cluster) updateDataNodeAutoRepairLimit(val uint64) { atomic.StoreUint64(&c.cfg.DataNodeAutoRepairLimitRate, val) } @@ -549,11 +564,13 @@ func (c *Cluster) loadClusterValue() (err error) { return err } c.cfg.MetaNodeThreshold = cv.Threshold + c.cfg.nodeSetCapacity = defaultNodeSetCapacity c.DisableAutoAllocate = cv.DisableAutoAllocate c.updateMetaNodeDeleteBatchCount(cv.MetaNodeDeleteBatchCount) c.updateMetaNodeDeleteWorkerSleepMs(cv.MetaNodeDeleteWorkerSleepMs) c.updateDataNodeDeleteLimitRate(cv.DataNodeDeleteLimitRate) c.updateDataNodeAutoRepairLimit(cv.DataNodeAutoRepairLimitRate) + c.updateRecoverPoolSize(cv.PoolSizeOfDataPartitionsInRecover, cv.PoolSizeOfMetaPartitionsInRecover) log.LogInfof("action[loadClusterValue], metaNodeThreshold[%v]", cv.Threshold) } return @@ -643,7 +660,7 @@ func (c *Cluster) loadMetaNodes() (err error) { } } c.metaNodes.Store(metaNode.Addr, metaNode) - log.LogInfof("action[loadMetaNodes],metaNode[%v], metaNodeID[%v],zone[%v],ns[%v]", metaNode.Addr, metaNode.ID, mnv.ZoneName, mnv.NodeSetID) + 
log.LogInfof("action[loadMetaNodes],metaNode[%v],id[%v],zone[%v],ns[%v]", metaNode.Addr, mnv.ID, mnv.ZoneName, mnv.NodeSetID) } return } @@ -660,10 +677,13 @@ func (c *Cluster) loadVols() (err error) { err = fmt.Errorf("action[loadVols],value:%v,unmarshal err:%v", string(value), err) return err } + if !vv.CrossZone && vv.ZoneName == "" { + vv.ZoneName = DefaultZoneName + } vol := newVolFromVolValue(vv) vol.Status = vv.Status c.putVol(vol) - log.LogInfof("action[loadVols],vol[%v]", vol.Name) + log.LogInfof("action[loadVols],vol[%v],id[%v],status[%v]", vol.Name, vv.ID, vv.Status) } return } @@ -701,6 +721,9 @@ func (c *Cluster) loadMetaPartitions() (err error) { mp.setPeers(mpv.Peers) mp.OfflinePeerID = mpv.OfflinePeerID mp.IsRecover = mpv.IsRecover + if mp.IsRecover { + c.putMigratedMetaPartitions("history", mp.PartitionID) + } vol.addMetaPartition(mp) log.LogInfof("action[loadMetaPartitions],vol[%v],mp[%v]", vol.Name, mp.PartitionID) } @@ -746,6 +769,9 @@ func (c *Cluster) loadDataPartitions() (err error) { } dp.afterCreation(rv.Addr, rv.DiskPath, c) } + if dp.isRecover { + c.putMigratedDataPartitionIDs(nil, "history", dp.PartitionID) + } vol.dataPartitions.put(dp) log.LogInfof("action[loadDataPartitions],vol[%v],dp[%v]", vol.Name, dp.PartitionID) } diff --git a/master/migration.go b/master/migration.go new file mode 100644 index 0000000000..635d51919c --- /dev/null +++ b/master/migration.go @@ -0,0 +1,142 @@ +package master + +import ( + "fmt" + "github.com/chubaofs/chubaofs/util/log" +) + +func (c *Cluster) checkMigratedDataPartitionsRecoveryProgress() { + defer func() { + if r := recover(); r != nil { + log.LogWarnf("checkMigratedDataPartitionsRecoveryProgress occurred panic,err[%v]", r) + WarnBySpecialKey(fmt.Sprintf("%v_%v_scheduling_job_panic", c.Name, ModuleName), + "checkMigratedDataPartitionsRecoveryProgress occurred panic") + } + }() + + c.MigratedDataPartitionIds.Range(func(key, value interface{}) bool { + badDataPartitionIds := value.([]uint64) + newBadDpIds := make([]uint64, 0) + for _, partitionID := range badDataPartitionIds { + partition, err := c.getDataPartitionByID(partitionID) + if err != nil { + continue + } + vol, err := c.getVol(partition.VolName) + if err != nil { + continue + } + if len(partition.Replicas) == 0 || len(partition.Replicas) < int(vol.dpReplicaNum) { + continue + } + if partition.isDataCatchUpInStrictMode() { + partition.isRecover = false + partition.RLock() + c.syncUpdateDataPartition(partition) + partition.RUnlock() + } else { + newBadDpIds = append(newBadDpIds, partitionID) + } + } + + if len(newBadDpIds) == 0 { + Warn(c.Name, fmt.Sprintf("action[checkMigratedDpRecoveryProgress] clusterID[%v],node:disk[%v] has recovered success", c.Name, key)) + c.MigratedDataPartitionIds.Delete(key) + } else { + c.MigratedDataPartitionIds.Store(key, newBadDpIds) + } + + return true + }) +} + +func (c *Cluster) putMigratedDataPartitionIDs(replica *DataReplica, addr string, partitionID uint64) { + var key string + newMigratedPartitionIDs := make([]uint64, 0) + if replica != nil { + key = fmt.Sprintf("%s:%s", addr, replica.DiskPath) + } else { + key = fmt.Sprintf("%s:%s", addr, "") + } + migratedPartitionIDs, ok := c.MigratedDataPartitionIds.Load(key) + if ok { + newMigratedPartitionIDs = migratedPartitionIDs.([]uint64) + } + newMigratedPartitionIDs = append(newMigratedPartitionIDs, partitionID) + c.MigratedDataPartitionIds.Store(key, newMigratedPartitionIDs) +} + +func (c *Cluster) putMigratedMetaPartitions(addr string, partitionID uint64) { + newMigratedPartitionIDs 
:= make([]uint64, 0) + migratedPartitionIDs, ok := c.MigratedMetaPartitionIds.Load(addr) + if ok { + newMigratedPartitionIDs = migratedPartitionIDs.([]uint64) + } + newMigratedPartitionIDs = append(newMigratedPartitionIDs, partitionID) + c.MigratedMetaPartitionIds.Store(addr, newMigratedPartitionIDs) +} + +func (c *Cluster) checkMigratedMetaPartitionRecoveryProgress() { + defer func() { + if r := recover(); r != nil { + log.LogWarnf("checkMigratedMetaPartitionRecoveryProgress occurred panic,err[%v]", r) + WarnBySpecialKey(fmt.Sprintf("%v_%v_scheduling_job_panic", c.Name, ModuleName), + "checkMigratedMetaPartitionRecoveryProgress occurred panic") + } + }() + + c.MigratedMetaPartitionIds.Range(func(key, value interface{}) bool { + badMetaPartitionIds := value.([]uint64) + for _, partitionID := range badMetaPartitionIds { + partition, err := c.getMetaPartitionByID(partitionID) + if err != nil { + continue + } + c.doLoadMetaPartition(partition) + } + return true + }) + + var ( + dentryDiff float64 + applyIDDiff float64 + ) + c.MigratedMetaPartitionIds.Range(func(key, value interface{}) bool { + badMetaPartitionIds := value.([]uint64) + newBadMpIds := make([]uint64, 0) + for _, partitionID := range badMetaPartitionIds { + partition, err := c.getMetaPartitionByID(partitionID) + if err != nil { + continue + } + vol, err := c.getVol(partition.volName) + if err != nil { + continue + } + if len(partition.Replicas) == 0 || len(partition.Replicas) < int(vol.mpReplicaNum) { + continue + } + dentryDiff = partition.getMinusOfDentryCount() + //inodeDiff = partition.getMinusOfInodeCount() + //inodeDiff = partition.getPercentMinusOfInodeCount() + applyIDDiff = partition.getMinusOfApplyID() + if dentryDiff == 0 && applyIDDiff == 0 { + partition.IsRecover = false + partition.RLock() + c.syncUpdateMetaPartition(partition) + partition.RUnlock() + } else { + newBadMpIds = append(newBadMpIds, partitionID) + } + } + + if len(newBadMpIds) == 0 { + Warn(c.Name, fmt.Sprintf("action[checkMigratedMpRecoveryProgress] clusterID[%v],node[%v] has recovered success", c.Name, key)) + c.MigratedMetaPartitionIds.Delete(key) + } else { + c.MigratedMetaPartitionIds.Store(key, newBadMpIds) + } + + return true + }) +} diff --git a/master/mocktest/data_server.go b/master/mocktest/data_server.go index 57dc96c161..4da69b740c 100644 --- a/master/mocktest/data_server.go +++ b/master/mocktest/data_server.go @@ -45,6 +45,7 @@ type MockDataServer struct { partitions []*MockDataPartition zoneName string mc *master.MasterClient + stopC chan bool } func NewMockDataServer(addr string, zoneName string) *MockDataServer { @@ -53,6 +54,7 @@ func NewMockDataServer(addr string, zoneName string) *MockDataServer { zoneName: zoneName, partitions: make([]*MockDataPartition, 0), mc: master.NewMasterClient([]string{hostAddr}, false), + stopC: make(chan bool), } return mds @@ -63,6 +65,10 @@ func (mds *MockDataServer) Start() { go mds.start() } +func (mds *MockDataServer) Stop() { + close(mds.stopC) +} + func (mds *MockDataServer) register() { var err error var nodeID uint64 @@ -86,6 +92,16 @@ func (mds *MockDataServer) start() { if err != nil { panic(err) } + defer listener.Close() + go func() { + for { + select { + case <-mds.stopC: + return + default: + } + } + }() for { conn, err := listener.Accept() if err != nil { diff --git a/master/mocktest/meta_server.go b/master/mocktest/meta_server.go index 514b2a2122..57486556e9 100644 --- a/master/mocktest/meta_server.go +++ b/master/mocktest/meta_server.go @@ -35,6 +35,7 @@ type MockMetaServer struct { mc 
*master.MasterClient partitions map[uint64]*MockMetaPartition // Key: metaRangeId, Val: metaPartition sync.RWMutex + stopC chan bool } func NewMockMetaServer(addr string, zoneName string) *MockMetaServer { @@ -42,6 +43,7 @@ func NewMockMetaServer(addr string, zoneName string) *MockMetaServer { TcpAddr: addr, partitions: make(map[uint64]*MockMetaPartition, 0), ZoneName: zoneName, mc: master.NewMasterClient([]string{hostAddr}, false), + stopC: make(chan bool), } return mms } @@ -51,6 +53,10 @@ func (mms *MockMetaServer) Start() { go mms.start() } +func (mms *MockMetaServer) Stop() { + close(mms.stopC) +} + func (mms *MockMetaServer) register() { var err error var nodeID uint64 @@ -75,6 +81,16 @@ func (mms *MockMetaServer) start() { if err != nil { panic(err) } + defer listener.Close() + go func() { + for { + select { + case <-mms.stopC: + return + default: + } + } + }() for { conn, err := listener.Accept() if err != nil { diff --git a/master/server.go b/master/server.go index e6eabca631..9af4860e45 100644 --- a/master/server.go +++ b/master/server.go @@ -117,6 +117,7 @@ func (m *Server) Start(cfg *config.Config) (err error) { if m.cluster.MasterSecretKey, err = cryptoutil.Base64Decode(MasterSecretKey); err != nil { return fmt.Errorf("action[Start] failed %v, err: master service Key invalid = %s", proto.ErrInvalidCfg, MasterSecretKey) } + m.cluster.scheduleTask() m.startHTTPService(ModuleName, cfg) exporter.RegistConsul(m.clusterName, ModuleName, cfg) diff --git a/master/topology.go b/master/topology.go index 0ef8c379e9..12c16967a2 100644 --- a/master/topology.go +++ b/master/topology.go @@ -20,6 +20,7 @@ import ( "github.com/chubaofs/chubaofs/util/errors" "github.com/chubaofs/chubaofs/util/log" "sort" + "strings" "sync" ) @@ -84,6 +85,9 @@ func (t *topology) putZoneIfAbsent(zone *Zone) (beStoredZone *Zone) { } func (t *topology) getZone(name string) (zone *Zone, err error) { + if name == "" { + return nil, fmt.Errorf("zone name is empty") + } t.zoneMap.Range(func(zoneName, value interface{}) bool { if zoneName != name { return true @@ -134,6 +138,15 @@ func (t *topology) getZoneByDataNode(dataNode *DataNode) (zone *Zone, err error) return t.getZone(dataNode.ZoneName) } +func (t *topology) getZoneByMetaNode(metaNode *MetaNode) (zone *Zone, err error) { + _, ok := t.metaNodes.Load(metaNode.Addr) + if !ok { + return nil, errors.Trace(metaNodeNotFound(metaNode.Addr), "%v not found", metaNode.Addr) + } + + return t.getZone(metaNode.ZoneName) +} + func (t *topology) putMetaNode(metaNode *MetaNode) (err error) { if _, ok := t.metaNodes.Load(metaNode.Addr); ok { return @@ -222,10 +235,15 @@ func (ns *nodeSet) deleteMetaNode(metaNode *MetaNode) { ns.metaNodes.Delete(metaNode.Addr) } -func (ns *nodeSet) canWriteForDataNode(replicaNum int) bool { +// can Write For DataNode With Exclude Hosts +func (ns *nodeSet) canWriteForDataNode(excludeHosts []string, replicaNum int) bool { var count int ns.dataNodes.Range(func(key, value interface{}) bool { node := value.(*DataNode) + if contains(excludeHosts, node.Addr) == true { + log.LogDebugf("contains return") + return true + } if node.isWriteAble() { count++ } @@ -306,64 +324,78 @@ func calculateDemandWriteNodes(zoneNum, replicaNum int) (demandWriteNodes int) { return } -func (t *topology) allocZonesForMetaNode(zoneNum, replicaNum int, excludeZone []string) (zones []*Zone, err error) { - zones = t.getAllZones() +func (t *topology) allocZonesForMetaNode(zoneName string, replicaNum int, excludeZone []string) (candidateZones []*Zone, err error) { + var 
initCandidateZones []*Zone
+	initCandidateZones = make([]*Zone, 0)
+	zoneList := strings.Split(zoneName, ",")
 	if t.isSingleZone() {
-		return zones, nil
+		return t.getAllZones(), nil
 	}
 	if excludeZone == nil {
 		excludeZone = make([]string, 0)
 	}
-	candidateZones := make([]*Zone, 0)
-	demandWriteNodes := calculateDemandWriteNodes(zoneNum, replicaNum)
-	for i := 0; i < len(zones); i++ {
-		if t.zoneIndexForMetaNode >= len(zones) {
-			t.zoneIndexForMetaNode = 0
+	for _, z := range zoneList {
+		var zone *Zone
+		if zone, err = t.getZone(z); err != nil {
+			return
 		}
-		zone := t.getZoneByIndex(t.zoneIndexForMetaNode)
-		t.zoneIndexForMetaNode++
+		initCandidateZones = append(initCandidateZones, zone)
+	}
+	demandWriteNodes := calculateDemandWriteNodes(len(zoneList), replicaNum)
+	candidateZones = make([]*Zone, 0)
+	for _, zone := range initCandidateZones {
 		if zone.status == unavailableZone {
 			continue
 		}
-		if contains(excludeZone, zone.name) {
-			continue
-		}
 		if zone.canWriteForMetaNode(uint8(demandWriteNodes)) {
 			candidateZones = append(candidateZones, zone)
 		}
-		if len(candidateZones) >= zoneNum {
+		if len(candidateZones) >= len(zoneList) {
 			break
 		}
 	}
+	//if there is no space in the zone for a single-zone partition, randomly choose another zone
+	if len(candidateZones) < 1 && len(zoneList) == 1 {
+		initCandidateZones = t.getAllZones()
+		for _, zone := range initCandidateZones {
+			if zone.status == unavailableZone {
+				continue
+			}
+			if zone.canWriteForMetaNode(uint8(demandWriteNodes)) {
+				candidateZones = append(candidateZones, zone)
+			}
+		}
+	}
 	//if across zone,candidateZones must be larger than or equal with 2,otherwise,must have a candidate zone
-	if (zoneNum >= 2 && len(candidateZones) < 2) || len(candidateZones) < 1 {
+	if (replicaNum == 3 && len(zoneList) >= 2 && len(candidateZones) < 2) || len(candidateZones) < 1 {
 		log.LogError(fmt.Sprintf("action[allocZonesForMetaNode],reqZoneNum[%v],candidateZones[%v],demandWriteNodes[%v],err:%v",
-			zoneNum, len(candidateZones), demandWriteNodes, proto.ErrNoZoneToCreateMetaPartition))
+			len(zoneList), len(candidateZones), demandWriteNodes, proto.ErrNoZoneToCreateMetaPartition))
 		return nil, proto.ErrNoZoneToCreateMetaPartition
 	}
-	zones = candidateZones
 	err = nil
 	return
 }
 
-func (t *topology) allocZonesForDataNode(zoneNum, replicaNum int, excludeZone []string) (zones []*Zone, err error) {
-	zones = t.getAllZones()
-	log.LogInfof("len(zones) = %v \n", len(zones))
+
+//allocate zones according to the specified zoneName and replicaNum
+func (t *topology) allocZonesForDataNode(zoneName string, replicaNum int, excludeZone []string) (candidateZones []*Zone, err error) {
+	var initCandidateZones []*Zone
+	initCandidateZones = make([]*Zone, 0)
+	zoneList := strings.Split(zoneName, ",")
 	if t.isSingleZone() {
-		return zones, nil
+		return t.getAllZones(), nil
 	}
-	if excludeZone == nil {
-		excludeZone = make([]string, 0)
-	}
-	demandWriteNodes := calculateDemandWriteNodes(zoneNum, replicaNum)
-	candidateZones := make([]*Zone, 0)
-	for i := 0; i < len(zones); i++ {
-		if t.zoneIndexForDataNode >= len(zones) {
-			t.zoneIndexForDataNode = 0
+	for _, z := range zoneList {
+		var zone *Zone
+		if zone, err = t.getZone(z); err != nil {
+			return
 		}
-		zone := t.getZoneByIndex(t.zoneIndexForDataNode)
-		t.zoneIndexForDataNode++
+		initCandidateZones = append(initCandidateZones, zone)
+	}
+	demandWriteNodes := calculateDemandWriteNodes(len(zoneList), replicaNum)
+	candidateZones = make([]*Zone, 0)
+	for _, zone := range initCandidateZones {
 		if zone.status == unavailableZone {
 			continue
 		}
@@ -373,17
+405,31 @@ func (t *topology) allocZonesForDataNode(zoneNum, replicaNum int, excludeZone [] if zone.canWriteForDataNode(uint8(demandWriteNodes)) { candidateZones = append(candidateZones, zone) } - if len(candidateZones) >= zoneNum { + if len(candidateZones) >= len(zoneList) { break } } - //if across zone,candidateZones must be larger than or equal with 2,otherwise,must have one candidate zone - if (zoneNum >= 2 && len(candidateZones) < 2) || len(candidateZones) < 1 { + //if there is no space in the zone for single zone partition, randomly choose a zone from all zones + if len(candidateZones) < 1 && len(zoneList) == 1 { + initCandidateZones = t.getAllZones() + for _, zone := range initCandidateZones { + if zone.status == unavailableZone { + continue + } + if contains(excludeZone, zone.name) { + continue + } + if zone.canWriteForDataNode(uint8(demandWriteNodes)) { + candidateZones = append(candidateZones, zone) + } + } + } + //if across zone,candidateZones must be larger than or equal with 2, if not across zone, must have one candidate zone + if (replicaNum == 3 && len(zoneList) >= 2 && len(candidateZones) < 2) || len(candidateZones) < 1 { log.LogError(fmt.Sprintf("action[allocZonesForDataNode],reqZoneNum[%v],candidateZones[%v],demandWriteNodes[%v],err:%v", - zoneNum, len(candidateZones), demandWriteNodes, proto.ErrNoZoneToCreateDataPartition)) + len(zoneList), len(candidateZones), demandWriteNodes, proto.ErrNoZoneToCreateDataPartition)) return nil, errors.NewError(proto.ErrNoZoneToCreateDataPartition) } - zones = candidateZones err = nil return } @@ -564,7 +610,7 @@ func (zone *Zone) deleteMetaNode(metaNode *MetaNode) (err error) { return } -func (zone *Zone) allocNodeSetForDataNode(excludeNodeSets []uint64, replicaNum uint8) (ns *nodeSet, err error) { +func (zone *Zone) allocNodeSetForDataNode(excludeNodeSets []uint64, excludeHosts []string, replicaNum uint8) (ns *nodeSet, err error) { nset := zone.getAllNodeSet() if nset == nil { return nil, errors.NewError(proto.ErrNoNodeSetToCreateDataPartition) @@ -580,7 +626,7 @@ func (zone *Zone) allocNodeSetForDataNode(excludeNodeSets []uint64, replicaNum u if containsID(excludeNodeSets, ns.ID) { continue } - if ns.canWriteForDataNode(int(replicaNum)) { + if ns.canWriteForDataNode(excludeHosts, int(replicaNum)) { return } } @@ -666,7 +712,7 @@ func (zone *Zone) getAvailDataNodeHosts(excludeNodeSets []uint64, excludeHosts [ if replicaNum == 0 { return } - ns, err := zone.allocNodeSetForDataNode(excludeNodeSets, uint8(replicaNum)) + ns, err := zone.allocNodeSetForDataNode(excludeNodeSets, excludeHosts, uint8(replicaNum)) if err != nil { return nil, nil, errors.Trace(err, "zone[%v] alloc node set,replicaNum[%v]", zone.name, replicaNum) } diff --git a/master/topology_test.go b/master/topology_test.go index 65dd844146..ffc3a17af7 100644 --- a/master/topology_test.go +++ b/master/topology_test.go @@ -41,7 +41,7 @@ func TestSingleZone(t *testing.T) { //single zone exclude,if it is a single zone excludeZones don't take effect excludeZones := make([]string, 0) excludeZones = append(excludeZones, zoneName) - zones, err := topo.allocZonesForDataNode(replicaNum, replicaNum, excludeZones) + zones, err := topo.allocZonesForDataNode(zoneName, replicaNum, excludeZones) if err != nil { t.Error(err) return @@ -52,7 +52,7 @@ func TestSingleZone(t *testing.T) { } //single zone normal - zones, err = topo.allocZonesForDataNode(replicaNum, replicaNum, nil) + zones, err = topo.allocZonesForDataNode(zoneName, replicaNum, nil) if err != nil { t.Error(err) return @@ -63,6 
+63,15 @@ func TestSingleZone(t *testing.T) { return } fmt.Println(newHosts) + + // single zone with exclude hosts + excludeHosts := []string{mds1Addr, mds2Addr, mds3Addr} + newHosts, _, err = zones[0].getAvailDataNodeHosts(nil, excludeHosts, replicaNum) + if err != nil { + t.Error(err) + return + } + fmt.Println(newHosts) topo.deleteDataNode(createDataNodeForTopo(mds1Addr, zoneName, nodeSet)) } @@ -98,7 +107,7 @@ func TestAllocZones(t *testing.T) { } //only pass replica num replicaNum := 2 - zones, err := topo.allocZonesForDataNode(replicaNum, replicaNum, nil) + zones, err := topo.allocZonesForDataNode(zoneName3, replicaNum, nil) if err != nil { t.Error(err) return @@ -110,14 +119,17 @@ func TestAllocZones(t *testing.T) { cluster := new(Cluster) cluster.t = topo cluster.cfg = newClusterConfig() + cluster.cfg.DataPartitionsRecoverPoolSize = maxDataPartitionsRecoverPoolSize + cluster.cfg.MetaPartitionsRecoverPoolSize = maxMetaPartitionsRecoverPoolSize + //don't cross zone - hosts, _, err := cluster.chooseTargetDataNodes("", nil, nil, replicaNum, 1, "") + hosts, _, err := cluster.chooseTargetDataNodes("", nil, nil, replicaNum, "zone1") if err != nil { t.Error(err) return } //cross zone - hosts, _, err = cluster.chooseTargetDataNodes("", nil, nil, replicaNum, 2, "") + hosts, _, err = cluster.chooseTargetDataNodes("", nil, nil, replicaNum, "zone1,zone2,zone3") if err != nil { t.Error(err) return @@ -126,7 +138,7 @@ func TestAllocZones(t *testing.T) { // after excluding zone3, alloc zones will be success excludeZones := make([]string, 0) excludeZones = append(excludeZones, zoneName3) - zones, err = topo.allocZonesForDataNode(2, replicaNum, excludeZones) + zones, err = topo.allocZonesForDataNode(zoneName3, replicaNum, excludeZones) if err != nil { t.Logf("allocZonesForDataNode failed,err[%v]", err) } diff --git a/master/vol.go b/master/vol.go index 912393caf2..b8e818e239 100644 --- a/master/vol.go +++ b/master/vol.go @@ -17,6 +17,7 @@ package master import ( "encoding/json" "fmt" + "strings" "sync" "github.com/chubaofs/chubaofs/proto" @@ -53,8 +54,9 @@ type Vol struct { NeedToLowerReplica bool FollowerRead bool authenticate bool - crossZone bool + autoRepair bool zoneName string + crossZone bool enableToken bool tokens map[string]*proto.Token tokensLock sync.RWMutex @@ -72,7 +74,7 @@ type Vol struct { sync.RWMutex } -func newVol(id uint64, name, owner, zoneName string, dpSize, capacity uint64, dpReplicaNum, mpReplicaNum uint8, followerRead, authenticate, crossZone bool, enableToken bool, createTime int64, description string) (vol *Vol) { +func newVol(id uint64, name, owner, zoneName string, dpSize, capacity uint64, dpReplicaNum, mpReplicaNum uint8, followerRead, authenticate, enableToken, autoRepair bool, createTime int64, description string) (vol *Vol) { vol = &Vol{ID: id, Name: name, MetaPartitions: make(map[uint64]*MetaPartition, 0)} vol.dataPartitions = newDataPartitionMap(name) if dpReplicaNum < defaultReplicaNum { @@ -91,16 +93,20 @@ func newVol(id uint64, name, owner, zoneName string, dpSize, capacity uint64, dp if dpSize < util.GB { dpSize = util.DefaultDataPartitionSize } + zoneList := strings.Split(zoneName, ",") + if len(zoneList) > 1 { + vol.crossZone = true + } vol.dataPartitionSize = dpSize vol.Capacity = capacity vol.FollowerRead = followerRead vol.authenticate = authenticate - vol.crossZone = crossZone vol.zoneName = zoneName vol.viewCache = make([]byte, 0) vol.mpsCache = make([]byte, 0) vol.createTime = createTime vol.enableToken = enableToken + vol.autoRepair = autoRepair 
vol.tokens = make(map[string]*proto.Token, 0) vol.description = description return @@ -118,8 +124,8 @@ func newVolFromVolValue(vv *volValue) (vol *Vol) { vv.ReplicaNum, vv.FollowerRead, vv.Authenticate, - vv.CrossZone, vv.EnableToken, + vv.AutoRepair, vv.CreateTime, vv.Description) // overwrite oss secure @@ -127,6 +133,8 @@ func newVolFromVolValue(vv *volValue) (vol *Vol) { vol.Status = vv.Status vol.dpSelectorName = vv.DpSelectorName vol.dpSelectorParm = vv.DpSelectorParm + vol.crossZone = vv.CrossZone + return vol } @@ -253,10 +261,7 @@ func (vol *Vol) checkDataPartitions(c *Cluster) (cnt int) { cnt++ } dp.checkDiskError(c.Name, c.leaderInfo.addr) - tasks := dp.checkReplicationTask(c.Name, vol.dataPartitionSize) - if len(tasks) != 0 { - c.addDataNodeTasks(tasks) - } + dp.checkReplicationTask(c, vol.dataPartitionSize) } return } @@ -301,9 +306,29 @@ func (vol *Vol) checkReplicaNum(c *Cluster) { } vol.NeedToLowerReplica = false } +func (vol *Vol) checkRepairMetaPartitions(c *Cluster) { + var err error + mps := vol.cloneMetaPartitionMap() + for _, mp := range mps { + if err = mp.RepairZone(vol, c); err != nil { + log.LogErrorf("action[checkRepairMetaPartitions],vol[%v],partitionID[%v],err[%v]", vol.Name, mp.PartitionID, err) + continue + } + } +} + +func (vol *Vol) checkRepairDataPartitions(c *Cluster) { + var err error + dps := vol.cloneDataPartitionMap() + for _, dp := range dps { + if err = dp.RepairZone(vol, c); err != nil { + log.LogErrorf("action[checkRepairDataPartitions],vol[%v],partitionID[%v],err[%v]", vol.Name, dp.PartitionID, err) + continue + } + } +} func (vol *Vol) checkMetaPartitions(c *Cluster) { - var tasks []*proto.AdminTask vol.checkSplitMetaPartition(c) maxPartitionID := vol.maxPartitionID() mps := vol.cloneMetaPartitionMap() @@ -324,9 +349,8 @@ func (vol *Vol) checkMetaPartitions(c *Cluster) { mp.checkReplicaNum(c, vol.Name, vol.mpReplicaNum) mp.checkEnd(c, maxPartitionID) mp.reportMissingReplicas(c.Name, c.leaderInfo.addr, defaultMetaPartitionTimeOutSec, defaultIntervalToAlarmMissingMetaPartition) - tasks = append(tasks, mp.replicaCreationTasks(c.Name, vol.Name)...) 
+ mp.replicaCreationTasks(c, vol.Name) } - c.addMetaNodeTasks(tasks) } func (vol *Vol) checkSplitMetaPartition(c *Cluster) { @@ -755,7 +779,7 @@ func (vol *Vol) doCreateMetaPartition(c *Cluster, start, end uint64) (mp *MetaPa wg sync.WaitGroup ) errChannel := make(chan error, vol.mpReplicaNum) - if hosts, peers, err = c.chooseTargetMetaHosts("", nil, nil, int(vol.mpReplicaNum), vol.crossZone, vol.zoneName); err != nil { + if hosts, peers, err = c.chooseTargetMetaHosts("", nil, nil, int(vol.mpReplicaNum), vol.zoneName); err != nil { log.LogErrorf("action[doCreateMetaPartition] chooseTargetMetaHosts err[%v]", err) return nil, errors.NewError(err) } diff --git a/master/vol_test.go b/master/vol_test.go index 660a7a5fe6..094b0d198b 100644 --- a/master/vol_test.go +++ b/master/vol_test.go @@ -5,6 +5,7 @@ import ( "github.com/chubaofs/chubaofs/proto" "github.com/chubaofs/chubaofs/util" "github.com/chubaofs/chubaofs/util/log" + "strings" "testing" "time" ) @@ -39,7 +40,7 @@ func TestCheckVol(t *testing.T) { func TestVol(t *testing.T) { capacity := 300 name := "test1" - createVol(name, t) + createVol(name, testZone2, t) //report mp/dp info to master server.cluster.checkDataNodeHeartbeat() server.cluster.checkDataNodeHeartbeat() @@ -56,7 +57,7 @@ func TestVol(t *testing.T) { } vol.checkStatus(server.cluster) getVol(name, t) - updateVol(name, capacity, t) + updateVol(name, "", capacity, t) statVol(name, t) markDeleteVol(name, t) getSimpleVol(name, t) @@ -64,8 +65,9 @@ func TestVol(t *testing.T) { vol.deleteVolFromStore(server.cluster) } -func createVol(name string, t *testing.T) { - reqURL := fmt.Sprintf("%v%v?name=%v&replicas=3&type=extent&capacity=100&owner=cfs&mpCount=2&zoneName=%v", hostAddr, proto.AdminCreateVol, name, testZone2) + +func createVol(name, zone string, t *testing.T) { + reqURL := fmt.Sprintf("%v%v?name=%v&replicas=3&type=extent&capacity=100&owner=cfs&mpCount=2&zoneName=%v", hostAddr, proto.AdminCreateVol, name, zone) fmt.Println(reqURL) process(reqURL, t) vol, err := server.cluster.getVol(name) @@ -77,6 +79,158 @@ func createVol(name string, t *testing.T) { checkMetaPartitionsWritableTest(vol, t) } +func TestVolMultiZoneDowngrade(t *testing.T) { + var vol *Vol + var err error + testMultiZone := "multiZoneDowngrade" + zoneList := []string{testZone1, testZone2, testZone3} + zone := strings.Join(zoneList, ",") + fmt.Printf(strings.Join(zoneList, ",")) + server.cluster.t.putZoneIfAbsent(newZone(testZone3)) + createVol(testMultiZone, zone, t) + //report mp/dp info to master + server.cluster.checkDataNodeHeartbeat() + server.cluster.checkDataNodeHeartbeat() + time.Sleep(3 * time.Second) + //check status + server.cluster.checkMetaPartitions() + server.cluster.checkDataPartitions() + server.cluster.checkLoadMetaPartitions() + server.cluster.doLoadDataPartitions() + vol, err = server.cluster.getVol(testMultiZone) + if err != nil { + t.Errorf("err is %v", err) + return + } + + vol.checkStatus(server.cluster) + getVol(testMultiZone, t) + updateVol(testMultiZone, zone, 200, t) + statVol(testMultiZone, t) + + // add meta node + addMetaServer(mms7Addr, testZone3) + addMetaServer(mms8Addr, testZone3) + // add data node + addDataServer(mds7Addr, testZone3) + addDataServer(mds8Addr, testZone3) + time.Sleep(3 * time.Second) + server.cluster.cfg = newClusterConfig() + + server.cluster.checkDataNodeHeartbeat() + server.cluster.checkMetaNodeHeartbeat() + + server.cluster.checkVolRepairDataPartitions() + server.cluster.checkVolRepairMetaPartitions() + + /*time.Sleep(time.Second * 10) + var mps 
map[uint64]*MetaPartition + mps = vol.cloneMetaPartitionMap() + var isRecover bool + if isRecover, err = checkZoneRecover(mps, zoneList, t); err != nil { + t.Errorf("err is %v", err) + } + if isRecover { + t.Errorf("checkVolRepairMetaPartition is forbidden when recover pool size equals -1") + }*/ + //test normal recover + server.cluster.cfg.MetaPartitionsRecoverPoolSize = maxMetaPartitionsRecoverPoolSize + server.cluster.cfg.DataPartitionsRecoverPoolSize = maxDataPartitionsRecoverPoolSize + server.cluster.checkVolRepairDataPartitions() + server.cluster.checkVolRepairMetaPartitions() + //wait for the partitions to be repaired + /*time.Sleep(time.Second * 10) + mps = vol.cloneMetaPartitionMap() + if isRecover, err = checkZoneRecover(mps, zoneList, t); err != nil { + t.Errorf("err is %v", err) + } + if !isRecover { + t.Errorf("checkVolRepairMetaPartition recover failed") + }*/ + markDeleteVol(testMultiZone, t) + getSimpleVol(testMultiZone, t) + vol.checkStatus(server.cluster) + vol.deleteVolFromStore(server.cluster) +} + +func checkZoneRecover(mps map[uint64]*MetaPartition, zoneList []string, t *testing.T) (isRecover bool, err error) { + var curZone []string + isRecover = true + for _, mp := range mps { + curZone = make([]string, 0) + for _, host := range mp.Hosts { + var mn *MetaNode + if mn, err = server.cluster.metaNode(host); err != nil { + return + } + if !contains(curZone, mn.ZoneName) { + curZone = append(curZone, mn.ZoneName) + } + } + if len(curZone) != len(zoneList) { + t.Logf("vol[%v], meta partition[%v] recover from downgrade failed, curZone:%v, zoneList:%v", mp.volName, mp.PartitionID, curZone, zoneList) + isRecover = false + continue + } + t.Logf("vol[%v], meta partition[%v] recover from downgrade successfully!", mp.volName, mp.PartitionID) + } + return +} +func TestVolMultiZone(t *testing.T) { + var vol *Vol + var err error + testMultiZone := "multiZone" + zoneList := []string{testZone1, testZone2, testZone3} + zone := strings.Join(zoneList, ",") + fmt.Printf(strings.Join(zoneList, ",")) + + createVol(testMultiZone, zone, t) + //report mp/dp info to master + server.cluster.checkDataNodeHeartbeat() + server.cluster.checkMetaNodeHeartbeat() + time.Sleep(3 * time.Second) + //check status + server.cluster.checkMetaPartitions() + server.cluster.checkDataPartitions() + server.cluster.checkLoadMetaPartitions() + server.cluster.doLoadDataPartitions() + vol, err = server.cluster.getVol(testMultiZone) + if err != nil { + t.Errorf("err is %v", err) + return + } + vol.checkStatus(server.cluster) + getVol(testMultiZone, t) + updateVol(testMultiZone, testZone1+","+testZone2, 200, t) + statVol(testMultiZone, t) + //check repair the first replica + server.cluster.checkVolRepairDataPartitions() + server.cluster.checkVolRepairMetaPartitions() + //set partition isRecovering to false + server.cluster.checkDiskRecoveryProgress() + server.cluster.checkMigratedDataPartitionsRecoveryProgress() + server.cluster.checkMetaPartitionRecoveryProgress() + server.cluster.checkMigratedMetaPartitionRecoveryProgress() + //check repair the second replica, so all replicas should have been repaired + server.cluster.checkVolRepairDataPartitions() + server.cluster.checkVolRepairMetaPartitions() + //wait for the partitions to be repaired + /*time.Sleep(time.Second * 5) + mps := vol.cloneMetaPartitionMap() + var isRecover bool + if isRecover, err = checkZoneRecover(mps, []string{testZone1, testZone2}, t); err != nil { + t.Errorf("err is %v", err) + } + if !isRecover { + t.Errorf("checkVolRepairMetaPartition recover 
failed") + }*/ + + markDeleteVol(testMultiZone, t) + getSimpleVol(testMultiZone, t) + vol.checkStatus(server.cluster) + vol.deleteVolFromStore(server.cluster) +} + func checkDataPartitionsWritableTest(vol *Vol, t *testing.T) { if len(vol.dataPartitions.partitions) == 0 { return @@ -130,9 +284,9 @@ func getVol(name string, t *testing.T) { process(reqURL, t) } -func updateVol(name string, capacity int, t *testing.T) { - reqURL := fmt.Sprintf("%v%v?name=%v&capacity=%v&authKey=%v", - hostAddr, proto.AdminUpdateVol, name, capacity, buildAuthKey("cfs")) +func updateVol(name, zone string, capacity int, t *testing.T) { + reqURL := fmt.Sprintf("%v%v?name=%v&capacity=%v&authKey=%v&zoneName=%v", + hostAddr, proto.AdminUpdateVol, name, capacity, buildAuthKey("cfs"), zone) fmt.Println(reqURL) process(reqURL, t) vol, err := server.cluster.getVol(name) @@ -144,6 +298,13 @@ func updateVol(name string, capacity int, t *testing.T) { t.Errorf("update vol failed,expect[%v],real[%v]", capacity, vol.Capacity) return } + if zone == "" { + return + } + if vol.zoneName != zone { + t.Errorf("update vol failed,expect[%v],real[%v]", zone, vol.zoneName) + return + } } func statVol(name string, t *testing.T) { @@ -213,7 +374,7 @@ func TestConcurrentReadWriteDataPartitionMap(t *testing.T) { var volID uint64 = 1 var createTime = time.Now().Unix() vol := newVol(volID, name, name, "", util.DefaultDataPartitionSize, 100, defaultReplicaNum, - defaultReplicaNum, false, false, false, false, createTime, "") + defaultReplicaNum, false, false, false, true, createTime, "") // unavailable mp mp1 := newMetaPartition(1, 1, defaultMaxMetaPartitionInodeID, 3, name, volID) vol.addMetaPartition(mp1) diff --git a/metanode/api_handler.go b/metanode/api_handler.go index 948bab072b..43d76c8f89 100644 --- a/metanode/api_handler.go +++ b/metanode/api_handler.go @@ -59,9 +59,21 @@ func (m *MetaNode) registerAPIHandler() (err error) { http.HandleFunc("/getDirectory", m.getDirectoryHandler) http.HandleFunc("/getAllDentry", m.getAllDentriesHandler) http.HandleFunc("/getParams", m.getParamsHandler) + http.HandleFunc("/getDiskStat", m.getDiskStatHandler) + return } +func (m *MetaNode) getDiskStatHandler(w http.ResponseWriter, + r *http.Request) { + resp := NewAPIResponse(http.StatusOK, http.StatusText(http.StatusOK)) + resp.Data = m.getDiskStat() + data, _ := resp.Marshal() + if _, err := w.Write(data); err != nil { + log.LogErrorf("[getPartitionsHandler] response %s", err) + } +} + func (m *MetaNode) getParamsHandler(w http.ResponseWriter, r *http.Request) { resp := NewAPIResponse(http.StatusOK, http.StatusText(http.StatusOK)) diff --git a/metanode/const.go b/metanode/const.go index 1dd7b034dd..f5e9ae046e 100644 --- a/metanode/const.go +++ b/metanode/const.go @@ -147,6 +147,7 @@ const ( cfgDeleteBatchCount = "deleteBatchCount" cfgTotalMem = "totalMem" cfgZoneName = "zoneName" + cfgTickIntervalMs = "tickIntervalMs" metaNodeDeleteBatchCountKey = "batchCount" ) diff --git a/metanode/disk.go b/metanode/disk.go new file mode 100644 index 0000000000..0b51167362 --- /dev/null +++ b/metanode/disk.go @@ -0,0 +1,106 @@ +// Copyright 2018 The Chubao Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. + +package metanode + +import ( + "sync" + "syscall" + "time" + + "github.com/chubaofs/chubaofs/util/log" +) + +// Disk represents the structure of the disk +type Disk struct { + sync.RWMutex + Path string + Total float64 + Used float64 + Available float64 + + stopCh chan struct{} +} + +func NewDisk(path string) (d *Disk) { + d = new(Disk) + d.Path = path + d.stopCh = make(chan struct{}, 1) + d.computeUsage() + d.startScheduleToUpdateSpaceInfo() + return +} + +// Compute the disk usage +func (d *Disk) computeUsage() (err error) { + d.RLock() + defer d.RUnlock() + fs := syscall.Statfs_t{} + err = syscall.Statfs(d.Path, &fs) + if err != nil { + return + } + + d.Total = float64(fs.Blocks) * float64(fs.Bsize) + d.Available = float64(fs.Bavail) * float64(fs.Bsize) + d.Used = d.Total - d.Available + + log.LogDebugf("action[computeUsage] disk(%v) all(%v) available(%v) used(%v)", d.Path, d.Total, d.Available, d.Used) + + return +} + +func (d *Disk) startScheduleToUpdateSpaceInfo() { + go func() { + updateSpaceInfoTicker := time.NewTicker(10 * time.Second) + defer func() { + updateSpaceInfoTicker.Stop() + }() + for { + select { + case <-d.stopCh: + log.LogInfof("[MetaNode]stop disk: %v stat \n", d.Path) + break + case <-updateSpaceInfoTicker.C: + d.computeUsage() + } + } + }() +} + +func (d *Disk) stopScheduleToUpdateSpaceInfo() { + d.stopCh <- struct{}{} +} + +func (m *MetaNode) startDiskStat() error { + m.disks = make(map[string]*Disk) + m.disks[m.metadataDir] = NewDisk(m.metadataDir) + m.disks[m.raftDir] = NewDisk(m.raftDir) + return nil +} + +func (m *MetaNode) stopDiskStat() { + for _, d := range m.disks { + d.stopScheduleToUpdateSpaceInfo() + } +} + +func (m *MetaNode) getDiskStat() []*Disk { + ds := make([]*Disk, 0) + for _, d := range m.disks { + ds = append(ds, d) + } + + return ds +} diff --git a/metanode/manager.go b/metanode/manager.go index 8c56f65b85..9f653c1045 100644 --- a/metanode/manager.go +++ b/metanode/manager.go @@ -17,6 +17,13 @@ package metanode import ( "encoding/json" "fmt" + "github.com/chubaofs/chubaofs/cmd/common" + "github.com/chubaofs/chubaofs/proto" + "github.com/chubaofs/chubaofs/raftstore" + "github.com/chubaofs/chubaofs/util" + "github.com/chubaofs/chubaofs/util/errors" + "github.com/chubaofs/chubaofs/util/exporter" + "github.com/chubaofs/chubaofs/util/log" "io/ioutil" "net" syslog "log" @@ -27,14 +34,7 @@ import ( "strings" "sync" "sync/atomic" - - "github.com/chubaofs/chubaofs/cmd/common" - "github.com/chubaofs/chubaofs/proto" - "github.com/chubaofs/chubaofs/raftstore" - "github.com/chubaofs/chubaofs/util" - "github.com/chubaofs/chubaofs/util/errors" - "github.com/chubaofs/chubaofs/util/exporter" - "github.com/chubaofs/chubaofs/util/log" + "time" ) const partitionPrefix = "partition_" @@ -120,7 +120,7 @@ func (m *metadataManager) HandleMetadataOperation(conn net.Conn, p *Packet, case proto.OpMetaLookup: err = m.opMetaLookup(conn, p, remoteAddr) case proto.OpDeleteMetaPartition: - err = m.opDeleteMetaPartition(conn, p, remoteAddr) + err = m.opExpiredMetaPartition(conn, p, remoteAddr) case proto.OpUpdateMetaPartition: err = 
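Disk.computeUsage above derives capacity from syscall.Statfs. A standalone sketch of the same arithmetic; the Statfs_t field types are Linux-specific, so this is illustrative rather than portable:

```go
package main

import (
	"fmt"
	"syscall"
)

func main() {
	var fs syscall.Statfs_t
	if err := syscall.Statfs("/tmp", &fs); err != nil {
		fmt.Println("statfs:", err)
		return
	}
	total := float64(fs.Blocks) * float64(fs.Bsize)
	available := float64(fs.Bavail) * float64(fs.Bsize)
	// Used mirrors computeUsage: total minus what unprivileged callers can use.
	fmt.Printf("total=%.0f available=%.0f used=%.0f\n", total, available, total-available)
}
```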
m.opUpdateMetaPartition(conn, p, remoteAddr) case proto.OpLoadMetaPartition: @@ -246,9 +246,10 @@ func (m *metadataManager) loadPartitions() (err error) { // Check metadataDir directory fileInfo, err := os.Stat(m.rootDir) if err != nil { - os.MkdirAll(m.rootDir, 0755) - err = nil - return + if os.IsNotExist(err) { + err = os.MkdirAll(m.rootDir, 0755) + } + return err } if !fileInfo.IsDir() { err = errors.New("metadataDir must be directory") @@ -257,7 +259,7 @@ func (m *metadataManager) loadPartitions() (err error) { // scan the data directory fileInfoList, err := ioutil.ReadDir(m.rootDir) if err != nil { - return + return err } var wg sync.WaitGroup for _, fileInfo := range fileInfoList { @@ -267,8 +269,27 @@ func (m *metadataManager) loadPartitions() (err error) { log.LogErrorf("loadPartitions: find expired partition[%s], rename it and you can delete him manually", fileInfo.Name()) oldName := path.Join(m.rootDir, fileInfo.Name()) - newName := path.Join(m.rootDir, ExpiredPartitionPrefix+fileInfo.Name()) - os.Rename(oldName, newName) + newName := path.Join(m.rootDir, ExpiredPartitionPrefix+fileInfo.Name()+"_"+strconv.FormatInt(time.Now().Unix(), 10)) + if tempErr := os.Rename(oldName, newName); tempErr != nil { + log.LogErrorf("rename file has err:[%s]", tempErr.Error()) + } + + if len(fileInfo.Name()) > 10 && strings.HasPrefix(fileInfo.Name(), partitionPrefix) { + log.LogErrorf("loadPartitions: find expired partition[%s], rename raft file", + fileInfo.Name()) + partitionId := fileInfo.Name()[len(partitionPrefix):] + oldRaftName := path.Join(m.metaNode.raftDir, partitionId) + newRaftName := path.Join(m.metaNode.raftDir, ExpiredPartitionPrefix+partitionId+"_"+strconv.FormatInt(time.Now().Unix(), 10)) + log.LogErrorf("loadPartitions: rename expired raft file [%s] -> [%s]", oldRaftName, newRaftName) + if _, tempErr := os.Stat(oldRaftName); tempErr != nil { + log.LogWarnf("stat file [%s] has err:[%s]", oldRaftName, tempErr.Error()) + } else { + if tempErr := os.Rename(oldRaftName, newRaftName); tempErr != nil { + log.LogErrorf("rename file has err:[%s]", tempErr.Error()) + } + } + } + continue } @@ -421,6 +442,18 @@ func (m *metadataManager) deletePartition(id uint64) (err error) { return } +func (m *metadataManager) expiredPartition(id uint64) (err error) { + m.mu.Lock() + defer m.mu.Unlock() + mp, has := m.partitions[id] + if !has { + return + } + mp.Expired() + delete(m.partitions, id) + return +} + // Range scans all the meta partitions. func (m *metadataManager) Range(f func(i uint64, p MetaPartition) bool) { m.mu.RLock() diff --git a/metanode/manager_op.go b/metanode/manager_op.go index f464b1b3c6..e6ebfcbcf1 100644 --- a/metanode/manager_op.go +++ b/metanode/manager_op.go @@ -18,15 +18,14 @@ import ( "bytes" "encoding/json" "fmt" - "net" - "os" - "runtime" - "github.com/chubaofs/chubaofs/proto" "github.com/chubaofs/chubaofs/util" "github.com/chubaofs/chubaofs/util/errors" "github.com/chubaofs/chubaofs/util/log" raftProto "github.com/tiglabs/raft/proto" + "net" + "os" + "runtime" ) const ( @@ -622,7 +621,40 @@ func (m *metadataManager) opMetaExtentsTruncate(conn net.Conn, p *Packet, } -// Delete a meta partition. +// Expire a meta partition instead of deleting it; the old delete handler is kept below for reference.
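Note on the hunks above: deletion is now deferred. Both loadPartitions and the partition-removal path rename directories with the "expired_" prefix plus a Unix-timestamp suffix instead of calling os.RemoveAll, leaving the final cleanup to the operator. A minimal sketch of the kind of out-of-band sweep an operator might run; the metadata root path and the retention window are assumptions, only the "expired_partition_<id>_<ts>" naming scheme comes from this patch:

package main

import (
	"fmt"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"time"
)

func main() {
	const root = "/export/metanode/metadata" // assumed metadataDir; adjust per deployment
	const retention = 7 * 24 * 3600          // example only: keep expired partitions for a week
	matches, err := filepath.Glob(filepath.Join(root, "expired_partition_*"))
	if err != nil {
		panic(err)
	}
	for _, dir := range matches {
		// the timestamp is the final "_"-separated token appended at expire time
		parts := strings.Split(filepath.Base(dir), "_")
		ts, err := strconv.ParseInt(parts[len(parts)-1], 10, 64)
		if err != nil {
			continue // no timestamp suffix; leave it for manual review
		}
		if time.Now().Unix()-ts > retention {
			fmt.Println("removing", dir)
			_ = os.RemoveAll(dir)
		}
	}
}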
-func (m *metadataManager) opDeleteMetaPartition(conn net.Conn, +//func (m *metadataManager) opDeleteMetaPartition(conn net.Conn, +// p *Packet, remoteAddr string) (err error) { +// req := &proto.DeleteMetaPartitionRequest{} +// adminTask := &proto.AdminTask{ +// Request: req, +// } +// decode := json.NewDecoder(bytes.NewBuffer(p.Data)) +// decode.UseNumber() +// if err = decode.Decode(adminTask); err != nil { +// p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error())) +// m.respondToClient(conn, p) +// return +// } +// mp, err := m.getPartition(req.PartitionID) +// if err != nil { +// p.PacketOkReply() +// m.respondToClient(conn, p) +// return +// } +// // Ack the master request +// conf := mp.GetBaseConfig() +// mp.Stop() +// mp.DeleteRaft() +// m.deletePartition(mp.GetBaseConfig().PartitionId) +// os.RemoveAll(conf.RootDir) +// p.PacketOkReply() +// m.respondToClient(conn, p) +// runtime.GC() +// log.LogInfof("%s [opDeleteMetaPartition] req: %d - %v, resp: %v", +// remoteAddr, p.GetReqID(), req, err) +// return +//} + +func (m *metadataManager) opExpiredMetaPartition(conn net.Conn, p *Packet, remoteAddr string) (err error) { req := &proto.DeleteMetaPartitionRequest{} adminTask := &proto.AdminTask{ @@ -643,11 +675,8 @@ func (m *metadataManager) opDeleteMetaPartition(conn net.Conn, return } // Ack the master request - conf := mp.GetBaseConfig() - mp.Stop() - mp.DeleteRaft() - m.deletePartition(mp.GetBaseConfig().PartitionId) - os.RemoveAll(conf.RootDir) + mp.ExpiredRaft() + m.expiredPartition(mp.GetBaseConfig().PartitionId) p.PacketOkReply() m.respondToClient(conn, p) runtime.GC() @@ -806,6 +835,7 @@ func (m *metadataManager) opAddMetaPartitionRaftMember(conn net.Conn, } mp, err := m.getPartition(req.PartitionId) if err != nil { + log.LogErrorf("get partition has err by id:[%d] err:[%s]", req.PartitionId, err.Error()) p.PacketErrorWithBody(proto.OpTryOtherAddr, ([]byte)(proto.ErrMetaPartitionNotExists.Error())) m.respondToClient(conn, p) return err @@ -862,6 +892,7 @@ func (m *metadataManager) opRemoveMetaPartitionRaftMember(conn net.Conn, m.respondToClient(conn, p) return err } + req.ReserveResource = adminTask.ReserveResource mp, err := m.getPartition(req.PartitionId) if err != nil { p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error())) diff --git a/metanode/metanode.go b/metanode/metanode.go index f0e0076d3e..ea7576f76e 100644 --- a/metanode/metanode.go +++ b/metanode/metanode.go @@ -55,8 +55,10 @@ type MetaNode struct { raftStore raftstore.RaftStore raftHeartbeatPort string raftReplicatePort string + tickInterval int zoneName string httpStopC chan uint8 + disks map[string]*Disk control common.Control } @@ -110,6 +112,9 @@ func doStart(s common.Server, cfg *config.Config) (err error) { if err = m.parseConfig(cfg); err != nil { return } + if err = m.startDiskStat(); err != nil { + return + } if err = m.register(); err != nil { return } @@ -151,6 +156,7 @@ func doShutdown(s common.Server) { m.stopServer() m.stopMetaManager() m.stopRaftServer() + m.stopDiskStat() } // Sync blocks the invoker's goroutine until the meta node shuts down.
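With startDiskStat wired into doStart above, each metanode refreshes Statfs-based usage for its metadata and raft directories every 10 seconds and serves them on the new /getDiskStat route. A quick probe might look like the sketch below; the prof port is a placeholder for whatever the deployment uses, and the response envelope mirrors the Code/Msg/Data fields of the APIResponse type used by the other handlers:

package main

import (
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	// 17220 is an assumed metanode prof port; substitute your own.
	resp, err := http.Get("http://127.0.0.1:17220/getDiskStat")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	var body struct {
		Code int
		Msg  string
		Data []struct {
			Path      string
			Total     float64
			Used      float64
			Available float64
		}
	}
	if err := json.NewDecoder(resp.Body).Decode(&body); err != nil {
		panic(err)
	}
	for _, d := range body.Data {
		fmt.Printf("%s: used %.0f of %.0f bytes\n", d.Path, d.Used, d.Total)
	}
}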
@@ -173,6 +179,12 @@ func (m *MetaNode) parseConfig(cfg *config.Config) (err error) { m.zoneName = cfg.GetString(cfgZoneName) configTotalMem, _ = strconv.ParseUint(cfg.GetString(cfgTotalMem), 10, 64) + m.tickInterval = int(cfg.GetFloat(cfgTickIntervalMs)) + if m.tickInterval <= 300 { + log.LogWarnf("config [%s]=[%v] is missing or 300ms or less, using the 500ms default", cfgTickIntervalMs, cfg.GetString(cfgTickIntervalMs)) + m.tickInterval = 500 + } + if configTotalMem == 0 { return fmt.Errorf("bad totalMem config,Recommended to be configured as 80 percent of physical machine memory") } diff --git a/metanode/partition.go b/metanode/partition.go index 6b06968b63..e30889cea7 100644 --- a/metanode/partition.go +++ b/metanode/partition.go @@ -21,6 +21,7 @@ import ( "strconv" "strings" "sync/atomic" + "time" "fmt" "io/ioutil" @@ -175,8 +176,10 @@ type OpPartition interface { PersistMetadata() (err error) ChangeMember(changeType raftproto.ConfChangeType, peer raftproto.Peer, context []byte) (resp interface{}, err error) Reset() (err error) + Expired() error UpdatePartition(req *UpdatePartitionReq, resp *UpdatePartitionResp) (err error) DeleteRaft() error + ExpiredRaft() error IsExsitPeer(peer proto.Peer) bool TryToLeader(groupID uint64) error CanRemoveRaftMember(peer proto.Peer) error @@ -549,6 +552,12 @@ func (mp *metaPartition) DeleteRaft() (err error) { return } +// ExpiredRaft marks the raft partition as expired. +func (mp *metaPartition) ExpiredRaft() (err error) { + err = mp.raftPartition.Expired() + return +} + // Return a new inode ID and update the offset. func (mp *metaPartition) nextInodeID() (inodeId uint64, err error) { for { @@ -668,6 +677,34 @@ func (mp *metaPartition) Reset() (err error) { return } + +func (mp *metaPartition) Expired() (err error) { + mp.stop() + if mp.delInodeFp != nil { + // TODO Unhandled errors + mp.delInodeFp.Sync() + mp.delInodeFp.Close() + } + + mp.inodeTree.Reset() + mp.dentryTree.Reset() + mp.config.Cursor = 0 + mp.applyID = 0 + + currentPath := path.Clean(mp.config.RootDir) + + var newPath = path.Join(path.Dir(currentPath), + ExpiredPartitionPrefix+path.Base(currentPath)+"_"+strconv.FormatInt(time.Now().Unix(), 10)) + + if err := os.Rename(currentPath, newPath); err != nil { + log.LogErrorf("ExpiredPartition: mark expired partition fail: partitionID(%v) path(%v) newPath(%v) err(%v)", mp.config.PartitionId, currentPath, newPath, err) + return err + } + log.LogInfof("ExpiredPartition: mark expired partition: partitionID(%v) path(%v) newPath(%v)", + mp.config.PartitionId, currentPath, newPath) + return nil +} + // func (mp *metaPartition) canRemoveSelf() (canRemove bool, err error) { var partition *proto.MetaPartitionInfo diff --git a/metanode/partition_fsm.go b/metanode/partition_fsm.go index d6d01d8d98..e65c395e08 100644 --- a/metanode/partition_fsm.go +++ b/metanode/partition_fsm.go @@ -257,7 +257,9 @@ func (mp *metaPartition) ApplySnapshot(peers []raftproto.Peer, iter raftproto.Sn mp.dentryTree = dentryTree mp.extendTree = extendTree mp.multipartTree = multipartTree - mp.config.Cursor = cursor + if cursor != 0 { + mp.config.Cursor = cursor + } err = nil // store message mp.storeChan <- &storeMsg{ @@ -269,7 +271,7 @@ func (mp *metaPartition) ApplySnapshot(peers []raftproto.Peer, iter raftproto.Sn multipartTree: mp.multipartTree, } mp.extReset <- struct{}{} - log.LogDebugf("ApplySnapshot: finish with EOF: partitionID(%v) applyID(%v)", mp.config.PartitionId, mp.applyID) + log.LogDebugf("ApplySnapshot: finish with EOF: partitionID(%v) applyID(%v), cursor(%v)",
mp.config.PartitionId, mp.applyID, mp.config.Cursor) return } log.LogErrorf("ApplySnapshot: stop with error: partitionID(%v) err(%v)", mp.config.PartitionId, err) diff --git a/metanode/partition_fsmop.go b/metanode/partition_fsmop.go index 2f530806e6..5800c0cb54 100644 --- a/metanode/partition_fsmop.go +++ b/metanode/partition_fsmop.go @@ -130,14 +130,18 @@ func (mp *metaPartition) confRemoveNode(req *proto.RemoveMetaPartitionRaftMember } mp.config.Peers = append(mp.config.Peers[:peerIndex], mp.config.Peers[peerIndex+1:]...) if mp.config.NodeId == req.RemovePeer.ID && !mp.isLoadingMetaPartition && canRemoveSelf { - mp.Stop() - mp.DeleteRaft() - mp.manager.deletePartition(mp.GetBaseConfig().PartitionId) - os.RemoveAll(mp.config.RootDir) + mp.ExpiredRaft() + mp.manager.expiredPartition(mp.GetBaseConfig().PartitionId) updated = false } log.LogInfof("Fininsh RemoveRaftNode PartitionID(%v) nodeID(%v) do RaftLog (%v) ", req.PartitionId, mp.config.NodeId, string(data)) + + return +} + +func (mp *metaPartition) confUpdateNode(req *proto.MetaPartitionDecommissionRequest, + index uint64) (updated bool, err error) { return } diff --git a/metanode/raft_server.go b/metanode/raft_server.go index 3970c26e46..c9a2abe27a 100644 --- a/metanode/raft_server.go +++ b/metanode/raft_server.go @@ -37,6 +37,7 @@ func (m *MetaNode) startRaftServer() (err error) { raftConf := &raftstore.Config{ NodeID: m.nodeId, RaftPath: m.raftDir, + TickInterval: m.tickInterval, IPAddr: m.localAddr, HeartbeatPort: heartbeatPort, ReplicaPort: replicaPort, diff --git a/proto/admin_proto.go b/proto/admin_proto.go index 5960008d43..db283d4b9a 100644 --- a/proto/admin_proto.go +++ b/proto/admin_proto.go @@ -39,6 +39,7 @@ const ( AdminListVols = "/vol/list" AdminSetNodeInfo = "/admin/setNodeInfo" AdminGetNodeInfo = "/admin/getNodeInfo" + AdminSetNodeState = "/admin/setNodeState" //graphql master api AdminClusterAPI = "/api/cluster" @@ -201,8 +202,9 @@ type AddDataPartitionRaftMemberRequest struct { // RemoveDataPartitionRaftMemberRequest defines the request of add raftMember a data partition. type RemoveDataPartitionRaftMemberRequest struct { - PartitionId uint64 - RemovePeer Peer + PartitionId uint64 + RemovePeer Peer + ReserveResource bool } // AddMetaPartitionRaftMemberRequest defines the request of add raftMember a meta partition. @@ -213,8 +215,9 @@ type AddMetaPartitionRaftMemberRequest struct { // RemoveMetaPartitionRaftMemberRequest defines the request of add raftMember a meta partition. type RemoveMetaPartitionRaftMemberRequest struct { - PartitionId uint64 - RemovePeer Peer + PartitionId uint64 + RemovePeer Peer + ReserveResource bool } // LoadDataPartitionRequest defines the request of loading a data partition. @@ -492,6 +495,7 @@ type SimpleVolView struct { NeedToLowerReplica bool Authenticate bool CrossZone bool + AutoRepair bool CreateTime string EnableToken bool Tokens map[string]*Token `graphql:"-"` diff --git a/proto/admin_task.go b/proto/admin_task.go index 6ac0a3964d..6892fee29a 100644 --- a/proto/admin_task.go +++ b/proto/admin_task.go @@ -31,16 +31,17 @@ const ( // AdminTask defines the administration task. 
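The tickIntervalMs plumbing ends with the raft_server.go hunk above: parseConfig clamps the value and startRaftServer hands it to raftstore.Config. Boiled down, the effective value is computed as in this sketch; the key name and the 500ms fallback come from the patch, and treating a missing key as 0 follows cfg.GetFloat's zero value:

package sketch

import "github.com/chubaofs/chubaofs/util/config"

// tickIntervalFromConfig mirrors the clamp in parseConfig: GetFloat yields 0
// for a missing key, so absent or too-aggressive settings fall back to 500ms.
func tickIntervalFromConfig(cfg *config.Config) int {
	tick := int(cfg.GetFloat("tickIntervalMs"))
	if tick <= 300 {
		return 500
	}
	return tick
}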
type AdminTask struct { - ID string - PartitionID uint64 - OpCode uint8 - OperatorAddr string - Status int8 - SendTime int64 - CreateTime int64 - SendCount uint8 - Request interface{} - Response interface{} + ID string + PartitionID uint64 + OpCode uint8 + OperatorAddr string + Status int8 + SendTime int64 + CreateTime int64 + SendCount uint8 + ReserveResource bool + Request interface{} + Response interface{} } // ToString returns the string format of the task. diff --git a/proto/model.go b/proto/model.go index d502e7daa9..82f74e9350 100644 --- a/proto/model.go +++ b/proto/model.go @@ -39,6 +39,8 @@ type MetaNodeInfo struct { MetaPartitionCount int NodeSetID uint64 PersistenceMetaPartitions []uint64 + ToBeOffline bool + ToBeMigrated bool } // DataNode stores all the information about a data node @@ -59,6 +61,8 @@ type DataNodeInfo struct { NodeSetID uint64 PersistenceDataPartitions []uint64 BadDisks []string + ToBeOffline bool + ToBeMigrated bool } // MetaPartition defines the structure of a meta partition @@ -84,29 +88,35 @@ type MetaPartitionInfo struct { // MetaReplica defines the replica of a meta partition type MetaReplicaInfo struct { - Addr string - ReportTime int64 - Status int8 // unavailable, readOnly, readWrite - IsLeader bool + Addr string + ReportTime int64 + Status int8 // unavailable, readOnly, readWrite + IsLeader bool + InodeCount uint64 + DentryCount uint64 } // ClusterView provides the view of a cluster. type ClusterView struct { - Name string - LeaderAddr string - DisableAutoAlloc bool - MetaNodeThreshold float32 - Applied uint64 - MaxDataPartitionID uint64 - MaxMetaNodeID uint64 - MaxMetaPartitionID uint64 - DataNodeStatInfo *NodeStatInfo - MetaNodeStatInfo *NodeStatInfo - VolStatInfo []*VolStatInfo - BadPartitionIDs []BadPartitionView - BadMetaPartitionIDs []BadPartitionView - MetaNodes []NodeView - DataNodes []NodeView + Name string + LeaderAddr string + DisableAutoAlloc bool + MetaNodeThreshold float32 + DpRecoverPool int32 + MpRecoverPool int32 + Applied uint64 + MaxDataPartitionID uint64 + MaxMetaNodeID uint64 + MaxMetaPartitionID uint64 + DataNodeStatInfo *NodeStatInfo + MetaNodeStatInfo *NodeStatInfo + VolStatInfo []*VolStatInfo + BadPartitionIDs []BadPartitionView + BadMetaPartitionIDs []BadPartitionView + MigratedDataPartitions []BadPartitionView + MigratedMetaPartitions []BadPartitionView + MetaNodes []NodeView + DataNodes []NodeView } // NodeView provides the view of the data or meta node. 
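The AdminTask change above is the master-to-node half of the new ReserveResource flag; manager_op.go earlier copies adminTask.ReserveResource into the removal request before deciding between delete and expire. A hypothetical master-side constructor shows the intended flow; buildRemoveTask itself is illustrative, only the struct fields come from this patch:

package sketch

import "github.com/chubaofs/chubaofs/proto"

// buildRemoveTask threads ReserveResource through both the task and the
// embedded request, matching the copy manager_op.go performs on the metanode.
func buildRemoveTask(operatorAddr string, pid uint64, peer proto.Peer, reserve bool) *proto.AdminTask {
	req := &proto.RemoveMetaPartitionRaftMemberRequest{
		PartitionId:     pid,
		RemovePeer:      peer,
		ReserveResource: reserve, // true keeps the replica's data on disk as expired_*
	}
	return &proto.AdminTask{
		PartitionID:     pid,
		OperatorAddr:    operatorAddr,
		ReserveResource: reserve,
		Request:         req,
	}
}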
@@ -217,3 +227,66 @@ type MetaPartitionDiagnosis struct { LackReplicaMetaPartitionIDs []uint64 BadMetaPartitionIDs []BadPartitionView } + +type ExtentInfo struct { + FileID uint64 `json:"fileId"` + Size uint64 `json:"size"` + Crc uint32 `json:"Crc"` + IsDeleted bool `json:"deleted"` + ModifyTime int64 `json:"modTime"` + Source string `json:"src"` +} + +// Status raft status +type Status struct { + ID uint64 + NodeID uint64 + Leader uint64 + Term uint64 + Index uint64 + Commit uint64 + Applied uint64 + Vote uint64 + PendQueue int + RecvQueue int + AppQueue int + Stopped bool + RestoringSnapshot bool + State string // leader, follower, candidate + Replicas map[uint64]*ReplicaStatus +} + +// ReplicaStatus replica status +type ReplicaStatus struct { + Match uint64 // copy progress + Commit uint64 // commit position + Next uint64 + State string + Snapshoting bool + Paused bool + Active bool + LastActive time.Time + Inflight int +} + +type DNDataPartitionInfo struct { + VolName string `json:"volName"` + ID uint64 `json:"id"` + Size int `json:"size"` + Used int `json:"used"` + Status int `json:"status"` + Path string `json:"path"` + Files []*ExtentInfo `json:"extents"` + FileCount int `json:"fileCount"` + Replicas []string `json:"replicas"` + TinyDeleteRecordSize int64 `json:"tinyDeleteRecordSize"` + RaftStatus *Status `json:"raftStatus"` + Peers []*Peer `json:"peers"` +} + +type MNMetaPartitionInfo struct { + LeaderAddr string `json:"leaderAddr"` + Peers []*Peer `json:"peers"` + NodeId uint64 `json:"nodeId"` + Cursor uint64 `json:"cursor"` +} diff --git a/proto/packet.go b/proto/packet.go index aafa4d36af..2d9f96be24 100644 --- a/proto/packet.go +++ b/proto/packet.go @@ -156,12 +156,13 @@ const ( ) const ( - WriteDeadlineTime = 5 - ReadDeadlineTime = 5 - SyncSendTaskDeadlineTime = 20 - NoReadDeadlineTime = -1 BatchDeleteExtentReadDeadLineTime = 120 - GetAllWatermarksDeadLineTime = 60 + WriteDeadlineTime = 5 + ReadDeadlineTime = 5 + SyncSendTaskDeadlineTime = 20 + NoReadDeadlineTime = -1 + MaxWaitFollowerRepairTime = 60 * 30 + GetAllWatermarksDeadLineTime = 60 ) const ( diff --git a/raftstore/partition.go b/raftstore/partition.go index 5098ecfca2..ca1e2bfb92 100644 --- a/raftstore/partition.go +++ b/raftstore/partition.go @@ -16,11 +16,20 @@ package raftstore import ( "os" + "path" + "strconv" + "time" + + "github.com/chubaofs/chubaofs/util/log" "github.com/tiglabs/raft" "github.com/tiglabs/raft/proto" ) +const ( + ExpiredPartitionPrefix = "expired_" +) + // PartitionStatus is a type alias of raft.Status type PartitionStatus = raft.Status @@ -46,6 +55,9 @@ type Partition interface { // Delete stops and deletes the partition. Delete() error + // Expired stops the partition and marks it as expired. + Expired() error + // Status returns the current raft status. Status() (status *PartitionStatus) @@ -110,8 +122,31 @@ func (p *partition) Delete() (err error) { return } +// Expired stops the partition and marks it as expired. +// It renames the data path with an 'expired_' prefix and the operation timestamp as a suffix. +// (e.g.
'/path/1' to '/path/expired_1_1600054521') +func (p *partition) Expired() (err error) { + if err = p.Stop(); err != nil { + return + } + var currentPath = path.Clean(p.walPath) + var newPath = path.Join(path.Dir(currentPath), + ExpiredPartitionPrefix+path.Base(currentPath)+"_"+strconv.FormatInt(time.Now().Unix(), 10)) + if err = os.Rename(currentPath, newPath); err != nil { + log.LogErrorf("Expired: mark expired partition fail: partitionID(%v) path(%v) newPath(%v) err(%v)", + p.id, p.walPath, newPath, err) + return + } + log.LogInfof("ExpiredPartition: mark expired partition: partitionID(%v) path(%v) newPath(%v)", + p.id, p.walPath, newPath) + return +} + // Status returns the current raft status. func (p *partition) Status() (status *PartitionStatus) { + if p == nil || p.raft == nil { + return nil + } status = p.raft.Status(p.id) return } diff --git a/sdk/master/api_admin.go b/sdk/master/api_admin.go index 68151a0059..e6ce5b377f 100644 --- a/sdk/master/api_admin.go +++ b/sdk/master/api_admin.go @@ -207,7 +207,7 @@ func (api *AdminAPI) DeleteVolume(volName, authKey string) (err error) { return } -func (api *AdminAPI) UpdateVolume(volName string, capacity uint64, replicas int, followerRead, authenticate, enableToken bool, authKey, zoneName string) (err error) { +func (api *AdminAPI) UpdateVolume(volName string, capacity uint64, replicas int, followerRead, authenticate, enableToken, autoRepair bool, authKey, zoneName string) (err error) { var request = newAPIRequest(http.MethodGet, proto.AdminUpdateVol) request.addParam("name", volName) request.addParam("authKey", authKey) @@ -216,6 +216,7 @@ func (api *AdminAPI) UpdateVolume(volName string, capacity uint64, replicas int, request.addParam("followerRead", strconv.FormatBool(followerRead)) request.addParam("enableToken", strconv.FormatBool(enableToken)) request.addParam("authenticate", strconv.FormatBool(authenticate)) + request.addParam("autoRepair", strconv.FormatBool(autoRepair)) request.addParam("zoneName", zoneName) if _, err = api.mc.serveRequest(request); err != nil { return @@ -246,7 +247,7 @@ func (api *AdminAPI) VolExpand(volName string, capacity uint64, authKey string) } func (api *AdminAPI) CreateVolume(volName, owner string, mpCount int, - dpSize uint64, capacity uint64, replicas int, followerRead bool, zoneName string) (err error) { + dpSize uint64, capacity uint64, replicas int, followerRead bool, autoRepair bool, zoneName string) (err error) { var request = newAPIRequest(http.MethodGet, proto.AdminCreateVol) request.addParam("name", volName) request.addParam("owner", owner) @@ -254,6 +255,7 @@ func (api *AdminAPI) CreateVolume(volName, owner string, mpCount int, request.addParam("size", strconv.FormatUint(dpSize, 10)) request.addParam("capacity", strconv.FormatUint(capacity, 10)) request.addParam("followerRead", strconv.FormatBool(followerRead)) + request.addParam("autoRepair", strconv.FormatBool(autoRepair)) request.addParam("zoneName", zoneName) if _, err = api.mc.serveRequest(request); err != nil { return diff --git a/sdk/master/api_node.go b/sdk/master/api_node.go index c91f456c0c..832827502d 100644 --- a/sdk/master/api_node.go +++ b/sdk/master/api_node.go @@ -16,6 +16,7 @@ package master import ( "encoding/json" + "fmt" "net/http" "strconv" @@ -124,3 +125,37 @@ func (api *NodeAPI) MetaNodeDecommission(nodeAddr string) (err error) { } return } + +func (api *NodeAPI) DataNodeGetPartition(addr string, id uint64) (node *proto.DNDataPartitionInfo, err error) { + var request = newAPIRequest(http.MethodGet, "/partition") + var 
buf []byte + nodeClient := NewNodeClient(fmt.Sprintf("%v:%v", addr, api.mc.DataNodeProfPort), false, DATANODE) + nodeClient.DataNodeProfPort = api.mc.DataNodeProfPort + request.addParam("id", strconv.FormatUint(id, 10)) + request.addHeader("isTimeOut", "false") + if buf, err = nodeClient.serveRequest(request); err != nil { + return + } + node = &proto.DNDataPartitionInfo{} + if err = json.Unmarshal(buf, &node); err != nil { + return + } + return +} + +func (api *NodeAPI) MetaNodeGetPartition(addr string, id uint64) (node *proto.MNMetaPartitionInfo, err error) { + var request = newAPIRequest(http.MethodGet, "/getPartitionById") + var buf []byte + nodeClient := NewNodeClient(fmt.Sprintf("%v:%v", addr, api.mc.MetaNodeProfPort), false, METANODE) + nodeClient.MetaNodeProfPort = api.mc.MetaNodeProfPort + request.addParam("pid", strconv.FormatUint(id, 10)) + request.addHeader("isTimeOut", "false") + if buf, err = nodeClient.serveRequest(request); err != nil { + return + } + node = &proto.MNMetaPartitionInfo{} + if err = json.Unmarshal(buf, &node); err != nil { + return + } + return +} diff --git a/sdk/master/client.go b/sdk/master/client.go index d56e844215..87a2cb5066 100644 --- a/sdk/master/client.go +++ b/sdk/master/client.go @@ -19,6 +19,7 @@ import ( "encoding/json" "errors" "fmt" + "github.com/chubaofs/chubaofs/proto" "io/ioutil" "net/http" "strconv" @@ -26,7 +27,6 @@ import ( "sync" "time" - "github.com/chubaofs/chubaofs/proto" "github.com/chubaofs/chubaofs/util/log" ) @@ -38,12 +38,24 @@ var ( ErrNoValidMaster = errors.New("no valid master") ) +type ClientType int + +const ( + MASTER ClientType = iota + DATANODE + METANODE +) + type MasterClient struct { sync.RWMutex - masters []string - useSSL bool - leaderAddr string timeout time.Duration + masters []string + useSSL bool + leaderAddr string + nodeAddr string + ClientType ClientType + DataNodeProfPort uint16 + MetaNodeProfPort uint16 adminAPI *AdminAPI clientAPI *ClientAPI @@ -97,8 +109,8 @@ func (c *MasterClient) SetTimeout(timeout uint16) { } func (c *MasterClient) serveRequest(r *request) (repsData []byte, err error) { - leaderAddr, nodes := c.prepareRequest() - host := leaderAddr + requestAddr, nodes := c.prepareRequest() + host := requestAddr for i := -1; i < len(nodes); i++ { if i == -1 { if host == "" { @@ -141,7 +153,7 @@ func (c *MasterClient) serveRequest(r *request) (repsData []byte, err error) { repsData, err = c.serveRequest(r) return case http.StatusOK: - if leaderAddr != host { + if requestAddr != host { c.setLeader(host) } var body = &struct { @@ -158,7 +170,19 @@ func (c *MasterClient) serveRequest(r *request) (repsData []byte, err error) { if body.Code != 0 { log.LogWarnf("serveRequest: code[%v], msg[%v], data[%v] ", body.Code, body.Msg, body.Data) - return nil, proto.ParseErrorCode(body.Code) + switch c.ClientType { + case MASTER: + // 0 represents proto.ErrCodeSuccess + if body.Code != 0 { + return nil, proto.ParseErrorCode(body.Code) + } + case DATANODE, METANODE: + // the data and meta node profiling APIs report 200 on success + if body.Code != 200 { + return nil, proto.ParseErrorCode(body.Code) + } + } } + return []byte(body.Data), nil default: log.LogErrorf("serveRequest: unknown status: host(%v) uri(%v) status(%v) body(%s).", @@ -180,10 +205,16 @@ func (c *MasterClient) Nodes() (nodes []string) { // prepareRequest returns the leader address and all master addresses.
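Taken together, the NodeAPI getters and the MasterClient changes above let one client type talk to the master as well as to data node and meta node prof ports. A usage sketch follows; the addresses and prof ports are placeholders, and NodeAPI() is assumed to be the existing accessor for the client's nodeAPI field:

package main

import (
	"fmt"

	"github.com/chubaofs/chubaofs/sdk/master"
)

func main() {
	mc := master.NewMasterClient([]string{"192.168.0.11:17010"}, false)
	mc.DataNodeProfPort = 17320 // placeholder prof ports
	mc.MetaNodeProfPort = 17220
	if dp, err := mc.NodeAPI().DataNodeGetPartition("192.168.0.21", 1017); err == nil {
		fmt.Printf("dp %v on %v: used %v of %v bytes\n", dp.ID, dp.Path, dp.Used, dp.Size)
	}
	if mp, err := mc.NodeAPI().MetaNodeGetPartition("192.168.0.31", 20); err == nil {
		fmt.Printf("mp leader %v cursor %v\n", mp.LeaderAddr, mp.Cursor)
	}
}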
func (c *MasterClient) prepareRequest() (addr string, nodes []string) { - c.RLock() - addr = c.leaderAddr - nodes = c.masters - c.RUnlock() + c.Lock() + switch c.ClientType { + case MASTER: + addr = c.leaderAddr + nodes = c.masters + case DATANODE, METANODE: + addr = c.nodeAddr + nodes = []string{addr} + } + c.Unlock() return } @@ -253,6 +284,18 @@ func (c *MasterClient) mergeRequestUrl(url string, params map[string]string) str // NewMasterHelper returns a new MasterClient instance. func NewMasterClient(masters []string, useSSL bool) *MasterClient { var mc = &MasterClient{masters: masters, useSSL: useSSL, timeout: requestTimeout} + mc.ClientType = MASTER + mc.adminAPI = &AdminAPI{mc: mc} + mc.clientAPI = &ClientAPI{mc: mc} + mc.nodeAPI = &NodeAPI{mc: mc} + mc.userAPI = &UserAPI{mc: mc} + return mc +} + +// NewNodeClient returns a new MasterClient instance bound to a single data or meta node. +func NewNodeClient(node string, useSSL bool, clientType ClientType) *MasterClient { + var mc = &MasterClient{nodeAddr: node, useSSL: useSSL} + mc.ClientType = clientType mc.adminAPI = &AdminAPI{mc: mc} mc.clientAPI = &ClientAPI{mc: mc} mc.nodeAPI = &NodeAPI{mc: mc} diff --git a/storage/extent_store.go b/storage/extent_store.go index cb835e414d..ea63dfcfcc 100644 --- a/storage/extent_store.go +++ b/storage/extent_store.go @@ -421,6 +421,10 @@ func (s *ExtentStore) PutNormalExtentToDeleteCache(extentID uint64) { func (s *ExtentStore) IsDeletedNormalExtent(extentID uint64) (ok bool) { _, ok = s.hasDeleteNormalExtentsCache.Load(extentID) + // drop the cached extent info as well once the deletion has been observed + s.eiMutex.Lock() + delete(s.extentInfoMap, extentID) + s.eiMutex.Unlock() return } @@ -473,35 +476,7 @@ func (s *ExtentStore) GetTinyExtentOffset(extentID uint64) (watermark int64, err return } -// Sector size -const ( - DiskSectorSize = 512 -) -func (s *ExtentStore) GetStoreUsedSize() (used int64) { - extentInfoSlice := make([]*ExtentInfo, 0, s.GetExtentCount()) - s.eiMutex.RLock() - for _, extentID := range s.extentInfoMap { - extentInfoSlice = append(extentInfoSlice, extentID) - } - s.eiMutex.RUnlock() - for _, einfo := range extentInfoSlice { - if einfo.IsDeleted { - continue - } - if IsTinyExtent(einfo.FileID) { - stat := new(syscall.Stat_t) - err := syscall.Stat(fmt.Sprintf("%v/%v", s.dataPath, einfo.FileID), stat) - if err != nil { - continue - } - used += (stat.Blocks * DiskSectorSize) - } else { - used += int64(einfo.Size) - } - } - return -} // GetAllWatermarks returns all the watermarks.
func (s *ExtentStore) GetAllWatermarks(filter ExtentFilter) (extents []*ExtentInfo, tinyDeleteFileSize int64, err error) { @@ -999,3 +974,32 @@ func (s *ExtentStore) TinyExtentAvaliOffset(extentID uint64, offset int64) (newO return } + +const ( + DiskSectorSize = 512 +) + +func (s *ExtentStore) GetStoreUsedSize() (used int64) { + extentInfoSlice := make([]*ExtentInfo, 0, s.GetExtentCount()) + s.eiMutex.RLock() + for _, extentInfo := range s.extentInfoMap { + extentInfoSlice = append(extentInfoSlice, extentInfo) + } + s.eiMutex.RUnlock() + for _, einfo := range extentInfoSlice { + if einfo.IsDeleted { + continue + } + if IsTinyExtent(einfo.FileID) { + stat := new(syscall.Stat_t) + err := syscall.Stat(fmt.Sprintf("%v/%v", s.dataPath, einfo.FileID), stat) + if err != nil { + continue + } + used += stat.Blocks * DiskSectorSize + } else { + used += int64(einfo.Size) + } + } + return +} \ No newline at end of file diff --git a/util/string.go b/util/string.go index 8ae4aaa765..ae1e158557 100644 --- a/util/string.go +++ b/util/string.go @@ -65,3 +65,43 @@ func RandomString(length int, seed RandomSeed) string { } return result } + +// Intersect returns the multiset intersection of string1 and string2. +func Intersect(string1, string2 []string) (inter []string) { + m := make(map[string]int) + for _, v := range string1 { + m[v]++ + } + + for _, v := range string2 { + times, ok := m[v] + if ok && times > 0 { + inter = append(inter, v) + m[v]-- + } + } + return +} + +// Projective returns the elements of long left over after removing one occurrence of each element of short; it returns an empty slice when short is not fully contained in long. +func Projective(long, short []string) (result []string) { + if len(short) == 0 { + return long + } + if len(Intersect(long, short)) < len(short) { + return make([]string, 0) + } + m := make(map[string]int) + for _, v := range short { + m[v]++ + } + for _, s := range long { + times, ok := m[s] + if times > 0 && ok { + m[s]-- + } else { + result = append(result, s) + } + } + return result +} diff --git a/vendor/github.com/tiglabs/raft/raft.go b/vendor/github.com/tiglabs/raft/raft.go index 74f3292acd..ef5a4cb63f 100644 --- a/vendor/github.com/tiglabs/raft/raft.go +++ b/vendor/github.com/tiglabs/raft/raft.go @@ -109,6 +109,7 @@ type raft struct { prevHardSt proto.HardState peerState peerState pending map[uint64]*Future + pendingCmd map[uint64]proto.EntryType snapping map[uint64]*snapshotStatus mStatus *monitorStatus propc chan *proposal @@ -149,6 +150,7 @@ func newRaft(config *Config, raftConfig *RaftConfig) (*raft, error) { raftConfig: raftConfig, mStatus: mStatus, pending: make(map[uint64]*Future), + pendingCmd: make(map[uint64]proto.EntryType), snapping: make(map[uint64]*snapshotStatus), recvc: make(chan *proto.Message, config.ReqBufferSize), applyc: make(chan *apply, config.AppBufferSize), @@ -230,7 +232,9 @@ func (s *raft) runApply() { ) switch cmd := apply.command.(type) { case *proto.ConfChange: + logger.Error("raft[%v] invoke ApplyMemberChange: cmd(%v) index(%v) future(%v)", s.raftFsm.id, cmd, apply.index, apply.future) resp, err = s.raftConfig.StateMachine.ApplyMemberChange(cmd, apply.index) + logger.Error("raft[%v] finish ApplyMemberChange: cmd(%v) index(%v) future(%v)", s.raftFsm.id, cmd, apply.index, apply.future) case []byte: resp, err = s.raftConfig.StateMachine.Apply(cmd, apply.index) } @@ -290,6 +294,7 @@ func (s *raft) run() { msg.From = s.config.NodeID starti := s.raftFsm.raftLog.lastIndex() + 1 s.pending[starti] = pr.future + s.pendingCmd[starti] = pr.cmdType msg.Entries = append(msg.Entries, &proto.Entry{Term: s.raftFsm.term, Index: starti, Type: pr.cmdType, Data: pr.data}) pool.returnProposal(pr) @@ -299,6 +304,7 @@ func (s *raft) run() { select { case pr := <-s.propc: s.pending[starti] = pr.future +
s.pendingCmd[starti] = pr.cmdType msg.Entries = append(msg.Entries, &proto.Entry{Term: s.raftFsm.term, Index: starti, Type: pr.cmdType, Data: pr.data}) pool.returnProposal(pr) default: @@ -308,6 +314,11 @@ func (s *raft) run() { break } } + for _, entry := range msg.Entries { + if entry.Type == proto.EntryConfChange { + logger.Error("raft[%v] step EntryConfChange: index(%v) term(%v)", s.raftFsm.id, entry.Index, entry.Term) + } + } s.raftFsm.Step(msg) case m := <-s.recvc: @@ -679,6 +690,7 @@ func (s *raft) apply() { if future, ok := s.pending[entry.Index]; ok { apply.future = future delete(s.pending, entry.Index) + delete(s.pendingCmd, entry.Index) } apply.readIndexes = s.raftFsm.readOnly.getReady(entry.Index) @@ -731,6 +743,7 @@ func (s *raft) resetPending(err error) { for k, v := range s.pending { v.respond(nil, err) delete(s.pending, k) + delete(s.pendingCmd, k) } } } @@ -770,6 +783,11 @@ func (s *raft) getStatus() *Status { default: } + pendingCmd := make(map[uint64]proto.EntryType) + for k, v := range s.pendingCmd { + pendingCmd[k] = v + } + st := &Status{ ID: s.raftFsm.id, NodeID: s.config.NodeID, @@ -782,6 +800,7 @@ func (s *raft) getStatus() *Status { State: s.raftFsm.state.String(), RestoringSnapshot: s.restoringSnapshot.Get(), PendQueue: len(s.pending), + PendCmd: pendingCmd, RecvQueue: len(s.recvc), AppQueue: len(s.applyc), Stopped: stopped, diff --git a/vendor/github.com/tiglabs/raft/raft_fsm.go b/vendor/github.com/tiglabs/raft/raft_fsm.go index 12e6cfad16..54a03ef9a9 100644 --- a/vendor/github.com/tiglabs/raft/raft_fsm.go +++ b/vendor/github.com/tiglabs/raft/raft_fsm.go @@ -20,9 +20,10 @@ import ( "math/rand" "strings" + "time" + "github.com/tiglabs/raft/logger" "github.com/tiglabs/raft/proto" - "time" ) // NoLeader is a placeholder nodeID used when there is no leader. diff --git a/vendor/github.com/tiglabs/raft/status.go b/vendor/github.com/tiglabs/raft/status.go index b13d50744b..45f0e44719 100644 --- a/vendor/github.com/tiglabs/raft/status.go +++ b/vendor/github.com/tiglabs/raft/status.go @@ -17,6 +17,8 @@ package raft import ( "fmt" "time" + + "github.com/tiglabs/raft/proto" ) // DownReplica down replica @@ -49,6 +51,7 @@ type Status struct { Applied uint64 Vote uint64 PendQueue int + PendCmd map[uint64]proto.EntryType RecvQueue int AppQueue int Stopped bool
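Closing note on the vendored raft changes: pendingCmd shadows the pending-futures map with each proposal's entry type, and getStatus snapshots it into Status.PendCmd, so a stuck proposal can be identified as a membership change without guesswork. A debugging sketch along those lines; the dump helper is hypothetical, while Status.PendCmd and proto.EntryConfChange come from the patch:

package sketch

import (
	"fmt"

	"github.com/tiglabs/raft"
	"github.com/tiglabs/raft/proto"
)

// dumpPendingCmds flags in-flight proposals that are raft membership changes.
func dumpPendingCmds(st *raft.Status) {
	for idx, typ := range st.PendCmd {
		if typ == proto.EntryConfChange {
			fmt.Printf("index %d: pending member change\n", idx)
		} else {
			fmt.Printf("index %d: pending normal entry\n", idx)
		}
	}
}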