[2/2] core168: Add script to automatically repair MDRAID arrays

Message ID 20220519085634.197389-2-michael.tremer@ipfire.org
State Accepted
Commit 71d53192d37db0d86a9dc04b11aa40016ba09b47
Headers
Series [1/2] core168: Add rd.auto to kernel command line |

Commit Message

Michael Tremer May 19, 2022, 8:56 a.m. UTC
  Please see the header of the script for more details.

Signed-off-by: Michael Tremer <michael.tremer@ipfire.org>
---
 config/rootfiles/common/aarch64/stage2 |   1 +
 config/rootfiles/common/armv6l/stage2  |   1 +
 config/rootfiles/common/x86_64/stage2  |   1 +
 config/rootfiles/core/168/update.sh    |   3 +
 src/scripts/repair-mdraid              | 169 +++++++++++++++++++++++++
 5 files changed, 175 insertions(+)
 create mode 100644 src/scripts/repair-mdraid
  

Patch

diff --git a/config/rootfiles/common/aarch64/stage2 b/config/rootfiles/common/aarch64/stage2
index 352c704d4..e328a4526 100644
--- a/config/rootfiles/common/aarch64/stage2
+++ b/config/rootfiles/common/aarch64/stage2
@@ -99,6 +99,7 @@  usr/local/bin/ipsec-interfaces
 usr/local/bin/makegraphs
 usr/local/bin/qosd
 usr/local/bin/readhash
+usr/local/bin/repair-mdraid
 usr/local/bin/run-parts
 usr/local/bin/scanhd
 usr/local/bin/settime
diff --git a/config/rootfiles/common/armv6l/stage2 b/config/rootfiles/common/armv6l/stage2
index 198461a01..2bd00d968 100644
--- a/config/rootfiles/common/armv6l/stage2
+++ b/config/rootfiles/common/armv6l/stage2
@@ -97,6 +97,7 @@  usr/local/bin/ipsec-interfaces
 usr/local/bin/makegraphs
 usr/local/bin/qosd
 usr/local/bin/readhash
+usr/local/bin/repair-mdraid
 usr/local/bin/run-parts
 usr/local/bin/scanhd
 usr/local/bin/settime
diff --git a/config/rootfiles/common/x86_64/stage2 b/config/rootfiles/common/x86_64/stage2
index b03a7fecf..586b88e3d 100644
--- a/config/rootfiles/common/x86_64/stage2
+++ b/config/rootfiles/common/x86_64/stage2
@@ -99,6 +99,7 @@  usr/local/bin/ipsec-interfaces
 usr/local/bin/makegraphs
 usr/local/bin/qosd
 usr/local/bin/readhash
+usr/local/bin/repair-mdraid
 usr/local/bin/run-parts
 usr/local/bin/scanhd
 usr/local/bin/settime
diff --git a/config/rootfiles/core/168/update.sh b/config/rootfiles/core/168/update.sh
index c4005dba9..84dec941c 100644
--- a/config/rootfiles/core/168/update.sh
+++ b/config/rootfiles/core/168/update.sh
@@ -125,6 +125,9 @@  if ! grep -q rd.auto /etc/default/grub; then
 	sed -e "s/panic=10/& rd.auto/" -i /etc/default/grub
 fi
 
+# Repair any broken MDRAID arrays
+/usr/local/bin/repair-mdraid
+
 # Start services
 /etc/init.d/fcron restart
 /etc/init.d/sshd restart
diff --git a/src/scripts/repair-mdraid b/src/scripts/repair-mdraid
new file mode 100644
index 000000000..a622ff71d
--- /dev/null
+++ b/src/scripts/repair-mdraid
@@ -0,0 +1,169 @@ 
+#!/bin/bash
+###############################################################################
+#                                                                             #
+# IPFire.org - A linux based firewall                                         #
+# Copyright (C) 2022 IPFire Team  <info@ipfire.org>                           #
+#                                                                             #
+# This program is free software: you can redistribute it and/or modify        #
+# it under the terms of the GNU General Public License as published by        #
+# the Free Software Foundation, either version 3 of the License, or           #
+# (at your option) any later version.                                         #
+#                                                                             #
+# This program is distributed in the hope that it will be useful,             #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of              #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the               #
+# GNU General Public License for more details.                                #
+#                                                                             #
+# You should have received a copy of the GNU General Public License           #
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.       #
+#                                                                             #
+###############################################################################
+#
+# This script is supposed to repair any broken RAID installations
+# where the system has been booted from only one of the RAID devices
+# without the software RAID being activated first.
+#
+# This script does as follows:
+#
+# * It tries to find an inactive RAID called "ipfire:0"
+# * It will then destroy any devices that are still part of this RAID.
+#   This is required because if the RAID were assembled normally,
+#   data from the disk that has NOT been mounted would be replicated
+#   back to the device that has been changed. As a result, any data
+#   that has been written to the mounted disk would be lost.
+#   To avoid this, we will partially destroy the RAID.
+# * We will then erase any partition tables and destroy any filesystems
+#   on the devices so that they do not get accidentally mounted again.
+# * The system will then need to be rebooted where the RAID will be
+#   mounted again in a degraded state which might take some extra
+#   time at boot (the system stands still for about a minute).
+# * After the system has been booted up correctly, we will re-add
+#   the devices back to the RAID which will resync and the system
+#   will be back to its intended configuration.
+
+# Print the device path of the inactive RAID named "ipfire:0".
+# Returns 0 if such an array was found, 1 otherwise.
+find_inactive_raid() {
+	local state array token rest
+
+	# Every line is read as: <state> <device> <remaining options...>
+	while read -r state array rest; do
+		# Only arrays reported as inactive are of interest
+		[ "${state}" = "INACTIVE-ARRAY" ] || continue
+
+		# Intentionally unquoted to split the options into words
+		for token in ${rest}; do
+			if [ "${token}" = "name=ipfire:0" ]; then
+				echo "${array}"
+				return 0
+			fi
+		done
+	done <<< "$(mdadm --detail --scan)"
+
+	return 1
+}
+
+# Print the disk that holds "/" (e.g. /dev/sda3 -> /dev/sda); 1 if none.
+find_root() {
+	local device mp rest
+
+	while read -r device mp rest; do
+		[ "${mp}" = "/" ] || continue
+		# Drop the whole partition suffix ("12" or "p2"), not one character
+		device="${device%"${device##*[!0-9]}"}"
+		[[ "${device}" =~ [0-9]p$ ]] && device="${device%p}"
+		echo "${device}"
+		return 0
+	done < /proc/mounts
+
+	return 1
+}
+
+# Print every member device of the RAID given as ${1}, one per line.
+find_raid_devices() {
+	local raid="${1}"
+	local member members
+
+	# The members are taken from a line like "devices=/dev/a,/dev/b"
+	members="$(mdadm -v --detail --scan "${raid}" | awk -F= '/^[ ]+devices/ { print $2 }')"
+
+	local IFS=,	# split the list on commas only
+	for member in ${members}; do echo "${member}"; done
+	return 0
+}
+
+# Make the disk ${1} unusable as a RAID member or mountable filesystem.
+destroy_everything() {
+	local device="${1}"
+	local part
+
+	# Destroy the RAID superblock
+	mdadm --zero-superblock "${device}"
+
+	# Wipe the partitions first: their device nodes may disappear as soon
+	# as the partition table is erased (wipefs makes the kernel re-read it)
+	for part in "${device}"?*; do
+		[ -e "${part}" ] && wipefs -a "${part}"
+	done
+	wipefs -a "${device}"	# finally wipe the partition table itself
+}
+
+raid_rebuild() {
+	local devices=( "$@" )
+	# Write a boot script that adds the given devices to /dev/md/ipfire:0.
+	# \${...} is expanded when that script runs, ${devices[@]} right now.
+	cat > /etc/rc.d/rcsysinit.d/S99fix-raid <<EOF
+#!/bin/bash
+
+case "\${1}" in
+	start)
+		if [ -e "/dev/md/ipfire:0" ]; then
+			for device in ${devices[@]}; do
+				mdadm --add "/dev/md/ipfire:0" "\${device}"
+			done
+
+			# Delete this script
+			rm "\${0}"
+		fi
+		;;
+esac
+EOF
+	chmod a+x /etc/rc.d/rcsysinit.d/S99fix-raid
+}
+
+# Entry point: repair the inactive RAID "ipfire:0" if there is one.
+main() {
+	local raid="$(find_inactive_raid)"
+
+	# Nothing to do if no RAID device found
+	[ -n "${raid}" ] || return 0
+
+	echo "Fixing RAID ${raid}..."
+
+	# Never continue without knowing the disk we are running from,
+	# because it could then not be excluded from being wiped
+	local root="$(find_root)"
+	[ -n "${root}" ] || return 1
+
+	# Finding any devices in this RAID
+	local devices=( $(find_raid_devices "${raid}") )
+
+	# Stop the RAID
+	mdadm --stop "${raid}" &>/dev/null
+
+	# Destroy any useful data on all remaining RAID devices and remember
+	# which ones were wiped as only those may be re-added after the reboot
+	local device wiped=()
+	for device in "${devices[@]}"; do
+		# Skip root
+		[ "${device}" = "${root}" ] && continue
+		destroy_everything "${device}"
+		wiped+=( "${device}" )
+	done &>/dev/null
+
+	# Re-add every wiped device to the RAID, not only the last loop value
+	raid_rebuild "${wiped[@]}"
+	return 0
+}
+# "return" is only valid in a function or sourced file, so use exit here
+main "$@" || exit $?