#!/usr/bin/env python3

import cv2
import numpy as np
from scipy.spatial.transform import Rotation
import deep_models_shared_python3 as dm


class HeadPoseEstimator:
    def __init__(self, models_directory, use_neural_compute_stick=False):
        # Load the models
        models_dir = models_directory
        print('Using the following directory to load object detector models:', models_dir)

        # file with network architecture and other information
        head_detection_model_prototxt_filename = models_dir + '/head_detection/deploy.prototxt'
        # file with network weights
        head_detection_model_caffemodel_filename = models_dir + '/head_detection/res10_300x300_ssd_iter_140000.caffemodel'
        self.face_confidence_threshold = 0.2
        print('attempting to load neural network from files')
        print('prototxt file =', head_detection_model_prototxt_filename)
        print('caffemodel file =', head_detection_model_caffemodel_filename)
        self.head_detection_model = cv2.dnn.readNetFromCaffe(head_detection_model_prototxt_filename,
                                                             head_detection_model_caffemodel_filename)
        dm.print_model_info(self.head_detection_model, 'head_detection_model')

        # attempt to use Neural Compute Stick 2
        if use_neural_compute_stick:
            print('HeadPoseEstimator.__init__: Attempting to use an Intel Neural Compute Stick 2 using the following command: self.head_detection_model.setPreferableTarget(cv2.dnn.DNN_TARGET_MYRIAD)')
            self.head_detection_model.setPreferableTarget(cv2.dnn.DNN_TARGET_MYRIAD)

        head_pose_model_dir = models_dir + '/open_model_zoo/head-pose-estimation-adas-0001/FP32/'
        head_pose_weights_filename = head_pose_model_dir + 'head-pose-estimation-adas-0001.bin'
        head_pose_config_filename = head_pose_model_dir + 'head-pose-estimation-adas-0001.xml'
        self.head_pose_model = cv2.dnn.readNet(head_pose_weights_filename, head_pose_config_filename)
        if use_neural_compute_stick:
            print('Not attempting to use an Intel Neural Compute Stick 2 for head pose estimation due to potential errors.')
        dm.print_model_info(self.head_pose_model, 'head_pose_model')

        landmarks_model_dir = models_dir + '/open_model_zoo/facial-landmarks-35-adas-0002/FP32/'
        landmarks_weights_filename = landmarks_model_dir + 'facial-landmarks-35-adas-0002.bin'
        landmarks_config_filename = landmarks_model_dir + 'facial-landmarks-35-adas-0002.xml'
        self.landmarks_model = cv2.dnn.readNet(landmarks_weights_filename, landmarks_config_filename)
        if use_neural_compute_stick:
            print('Not attempting to use an Intel Neural Compute Stick 2 for facial landmarks due to potential errors.')
        dm.print_model_info(self.landmarks_model, 'landmarks_model')

        self.landmark_names = ['right_eye_left', 'right_eye_right',
                               'left_eye_right', 'left_eye_left',
                               'nose_tip', 'nose_bottom', 'nose_right', 'nose_left',
                               'mouth_right', 'mouth_left', 'mouth_top', 'mouth_bottom',
                               'right_eyebrow_right', 'right_eyebrow_middle', 'right_eyebrow_left',
                               'left_eyebrow_right', 'left_eyebrow_middle', 'left_eyebrow_left',
                               'right_cheek_18', 'right_cheek_19', 'right_cheek_20',
                               'right_cheek_21', 'right_cheek_22', 'right_cheek_23', 'right_cheek_24',
                               'chin_right', 'chin_middle', 'chin_left',
                               'left_cheek_28', 'left_cheek_29', 'left_cheek_30', 'left_cheek_31',
                               'left_cheek_32', 'left_cheek_33', 'left_cheek_34']

    def get_landmark_names(self):
        return self.landmark_names

    def get_landmark_colors(self):
        return None

    def get_landmark_color_dict(self):
        return None

    def detect_faces(self, rgb_image):
        orig_h, orig_w, c = rgb_image.shape
        face_image = rgb_image
        rot_h, rot_w, c = face_image.shape
        # Assumes that the width is smaller than the height, and crops
        # a width x width square image from the top.
        square_face_image = face_image[:rot_w, :, :]
        sqr_h, sqr_w, c = square_face_image.shape
        network_image = cv2.resize(square_face_image, (300, 300))
        # Some magic numbers came from
        # https://www.pyimagesearch.com/2018/02/26/face-detection-with-opencv-and-deep-learning/
        face_image_blob = cv2.dnn.blobFromImage(network_image, 1.0, (300, 300), (104.0, 177.0, 123.0))
        self.head_detection_model.setInput(face_image_blob)
        face_detections = self.head_detection_model.forward()[0, 0, :, :]
        confidence_mask = face_detections[:, 2] > self.face_confidence_threshold
        face_detections = face_detections[confidence_mask]
        coordinates = face_detections[:, 3:7]
        # Scale the normalized detection coordinates back to the square crop,
        # which shares its upper-left corner with the original image, so no
        # further transformation is needed.
        coordinates = coordinates * np.array([sqr_w, sqr_h, sqr_w, sqr_h])
        boxes = []
        for x0, y0, x1, y1 in coordinates:
            boxes.append([x0, y0, x1, y1])
        return boxes

    def get_sub_image(self, rgb_image, bounding_box, enlarge_box=True, enlarge_scale=1.15):
        if enlarge_box:
            scale = enlarge_scale
            orig_h, orig_w, c = rgb_image.shape
            x0 = bounding_box[0]
            y0 = bounding_box[1]
            x1 = bounding_box[2]
            y1 = bounding_box[3]
            # Enlarge the box about its center, then clip it to the image bounds.
            m_x = (x1 + x0) / 2.0
            m_y = (y1 + y0) / 2.0
            b_w = scale * (x1 - x0)
            b_h = scale * (y1 - y0)
            x0 = max(0, int(round(m_x - (b_w / 2.0))))
            x1 = min(orig_w, int(round(m_x + (b_w / 2.0))))
            y0 = max(0, int(round(m_y - (b_h / 2.0))))
            y1 = min(orig_h, int(round(m_y + (b_h / 2.0))))
        else:
            x0 = int(round(bounding_box[0]))
            y0 = int(round(bounding_box[1]))
            x1 = int(round(bounding_box[2]))
            y1 = int(round(bounding_box[3]))
        actual_bounding_box = [x0, y0, x1, y1]
        sub_image = rgb_image[y0:y1, x0:x1, :]
        return sub_image, actual_bounding_box

    def estimate_head_pose(self, rgb_image, bounding_box, enlarge_box=True, enlarge_scale=1.15):
        face_crop_image, actual_bounding_box = self.get_sub_image(rgb_image, bounding_box,
                                                                  enlarge_box=enlarge_box,
                                                                  enlarge_scale=enlarge_scale)
        sqr_h, sqr_w, c = face_crop_image.shape
        if (sqr_h > 0) and (sqr_w > 0):
            head_pose_image_blob = cv2.dnn.blobFromImage(face_crop_image,
                                                         size=(60, 60),
                                                         swapRB=False,
                                                         crop=False,
                                                         ddepth=cv2.CV_32F)
            self.head_pose_model.setInput(head_pose_image_blob)
            head_pose_out = self.head_pose_model.forward(['angle_r_fc', 'angle_p_fc', 'angle_y_fc'])
            rpy = head_pose_out
            roll = rpy[0][0][0]
            pitch = rpy[1][0][0]
            yaw = rpy[2][0][0]
            # The network outputs angles in degrees; convert to radians.
            pitch = pitch * np.pi / 180.0
            roll = roll * np.pi / 180.0
            yaw = yaw * np.pi / 180.0
            return yaw, pitch, roll
        return None, None, None

    def detect_facial_landmarks(self, rgb_image, bounding_box, enlarge_box=True, enlarge_scale=1.15):
        face_crop_image, actual_bounding_box = self.get_sub_image(rgb_image, bounding_box,
                                                                  enlarge_box=enlarge_box,
                                                                  enlarge_scale=enlarge_scale)
        sqr_h, sqr_w, c = face_crop_image.shape
        if (sqr_h > 0) and (sqr_w > 0):
            landmarks_image_blob = cv2.dnn.blobFromImage(face_crop_image,
                                                         size=(60, 60),
                                                         swapRB=False,
                                                         crop=False,
                                                         ddepth=cv2.CV_32F)
            self.landmarks_model.setInput(landmarks_image_blob)
            landmarks_out = self.landmarks_model.forward()
            s = landmarks_out.shape
            out = np.reshape(landmarks_out[0], (s[1] // 2, 2))
            # The network outputs (x, y) pairs normalized to the crop; map them
            # back into full-image pixel coordinates.
            x0, y0, x1, y1 = actual_bounding_box
            landmarks = {}
            for n, v in enumerate(out):
                x = int(round((v[0] * sqr_w) + x0))
                y = int(round((v[1] * sqr_h) + y0))
                name = self.landmark_names[n]
                landmarks[name] = (x, y)
            return landmarks, self.landmark_names.copy()
        return None, None

    def draw_bounding_box(self, image, bounding_box):
        x0 = int(round(bounding_box[0]))
        y0 = int(round(bounding_box[1]))
        x1 = int(round(bounding_box[2]))
        y1 = int(round(bounding_box[3]))
        color = (0, 0, 255)
        thickness = 2
        cv2.rectangle(image, (x0, y0), (x1, y1), color, thickness)

    def draw_head_pose(self, image, yaw, pitch, roll, bounding_box):
        x0, y0, x1, y1 = bounding_box
        face_x = (x1 + x0) / 2.0
        face_y = (y1 + y0) / 2.0

        # OpenCV uses a right-handed coordinate system:
        #   x points to the right of the image
        #   y points to the bottom of the image
        #   z points into the image
        h, w, c = image.shape
        camera_center = (w / 2.0, h / 2.0)
        # For rendering with an unknown camera
        focal_length = 50.0
        camera_matrix = np.array([[focal_length, 0.0, camera_center[0]],
                                  [0.0, focal_length, camera_center[1]],
                                  [0.0, 0.0, 1.0]])
        face_translation = np.array([0.0, 0.0, 3000.0])
        distortion_coefficients = np.array([0.0, 0.0, 0.0, 0.0])
        # negate the directions of the y and z axes
        axes = np.array([[2000.0, 0.0, 0.0],
                         [0.0, -2000.0, 0.0],
                         [0.0, 0.0, -2000.0],
                         [0.0, 0.0, 0.0]])
        head_ypr = np.array([-yaw, pitch, roll])
        # as_matrix() replaces as_dcm(), which was deprecated in SciPy 1.4 and
        # later removed.
        rotation_mat = Rotation.from_euler('yxz', head_ypr).as_matrix()
        rotation_vec, jacobian = cv2.Rodrigues(rotation_mat)
        image_points, jacobian = cv2.projectPoints(axes, rotation_vec, face_translation,
                                                   camera_matrix, distortion_coefficients)

        # Translate the projected axes so their origin sits at the face center.
        face_pix = np.array([face_x, face_y])
        origin = image_points[3].ravel()
        x_axis = (image_points[0].ravel() - origin) + face_pix
        y_axis = (image_points[1].ravel() - origin) + face_pix
        z_axis = (image_points[2].ravel() - origin) + face_pix

        p0 = tuple(np.int32(np.round(face_pix)))
        p1 = tuple(np.int32(np.round(x_axis)))
        cv2.line(image, p0, p1, (0, 0, 255), 2)
        p1 = tuple(np.int32(np.round(y_axis)))
        cv2.line(image, p0, p1, (0, 255, 0), 2)
        p1 = tuple(np.int32(np.round(z_axis)))
        cv2.line(image, p0, p1, (255, 0, 0), 2)

    def draw_landmarks(self, image, landmarks):
        for name, xy in landmarks.items():
            x = xy[0]
            y = xy[1]
            if 'mouth' in name:
                color = (255, 0, 0)
            elif 'nose' in name:
                color = (0, 255, 0)
            elif 'eyebrow' in name:
                color = (0, 0, 0)
            elif 'right_eye' in name:
                color = (255, 255, 0)
            elif 'left_eye' in name:
                color = (0, 255, 255)
            elif 'chin' in name:
                color = (255, 0, 255)
            else:
                color = (0, 0, 255)
            cv2.circle(image, (x, y), 2, color, 1)

    def apply_to_image(self, rgb_image, draw_output=False):
        if draw_output:
            output_image = rgb_image.copy()
        else:
            output_image = None

        heads = []
        boxes = self.detect_faces(rgb_image)
        for bounding_box in boxes:
            if draw_output:
                self.draw_bounding_box(output_image, bounding_box)
            yaw, pitch, roll = self.estimate_head_pose(rgb_image, bounding_box,
                                                       enlarge_box=True, enlarge_scale=1.15)
            if yaw is not None:
                ypr = (yaw, pitch, roll)
                if draw_output:
                    self.draw_head_pose(output_image, yaw, pitch, roll, bounding_box)
            else:
                ypr = None
            landmarks, landmark_names = self.detect_facial_landmarks(rgb_image, bounding_box,
                                                                     enlarge_box=True,
                                                                     enlarge_scale=1.15)
            if (landmarks is not None) and draw_output:
                self.draw_landmarks(output_image, landmarks)
            heads.append({'box': bounding_box, 'ypr': ypr, 'landmarks': landmarks})
        return heads, output_image
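

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module): shows one way
# to drive the pipeline above end to end. The models directory path and the
# test image filename below are hypothetical placeholders; adjust them to
# match your installation.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import sys

    models_directory = './models'  # hypothetical path to the downloaded models
    estimator = HeadPoseEstimator(models_directory, use_neural_compute_stick=False)

    # cv2.imread returns BGR; apply_to_image expects RGB, so convert first.
    bgr_image = cv2.imread('face_test.png')  # hypothetical test image
    if bgr_image is None:
        sys.exit('could not read test image')
    rgb_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)

    # heads is a list of dictionaries, one per detected face, each with a
    # 'box' (pixel bounding box), 'ypr' (yaw, pitch, roll in radians, or
    # None), and 'landmarks' (name -> (x, y) pixel coordinates, or None).
    heads, output_image = estimator.apply_to_image(rgb_image, draw_output=True)
    for head in heads:
        print('box =', head['box'])
        print('ypr =', head['ypr'])

    # Convert the annotated RGB image back to BGR for display.
    cv2.imshow('head pose', cv2.cvtColor(output_image, cv2.COLOR_RGB2BGR))
    cv2.waitKey(0)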