Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

String search in all Databricks notebook in workspace level

Use case: Is it possible to check all notebooks in one particular Databricks workspace and get an inventory of where notebooks directly access data through ADLS locations rather than through the metastore?

Information to capture:

Notebook names where “abfss://” is referenced

The specific file path(s) referenced in each notebook

Please help me on this if possible.

like image 202
Karthikeyan Rasipalay Durairaj Avatar asked Oct 31 '25 05:10

Karthikeyan Rasipalay Durairaj


2 Answers

This functionality isn't available on Azure Databricks - you can search by notebook or folder names only. But you can still do what you want by exporting notebooks to local disk & search for the string. For exporting you can use Databricks CLI's workspace export_dir command, like this:

databricks workspace export_dir '/Shared/' ~/tmp/databricks-files
like image 189
Alex Ott Avatar answered Nov 02 '25 19:11

Alex Ott


Finally, we found the right solution. Even though the `workspace export_dir` command is one solution for this use case, it requires downloading all notebooks from the workspace to local disk, which is not recommended by our security team. So, alternatively, we achieved it through the `2.0/workspace/export` REST API, called from within a Databricks notebook itself.

Steps :

  1. Find all notebooks, along with their notebook objects (notebook path, object id).
  2. Iterate over each notebook path and fetch the content of that notebook in encoded (base64) format.
  3. Decode the notebook content and check whether it contains the search string.
  4. Get the notebook owners based on the object id.
  5. Generate the final results.

Reference: https://docs.databricks.com/dev-tools/api/latest/workspace.html#export

Complete code for reference:

from pyspark.sql.types import IntegerType
from pyspark.sql.types import *
from pyspark.sql import Row
import base64
import requests
import json

# --- Workspace API configuration ---
# NOTE(review): placeholder URL — replace with the real workspace instance.
databricks_instance ="https://databricks_instance.azuredatabricks.net"

# REST endpoints: list workspace objects and export notebook content.
url_list = f"{databricks_instance}/api/2.0/workspace/list"
url_export = f"{databricks_instance}/api/2.0/workspace/export"


# Root path from which the recursive notebook scan starts.
payload = json.dumps({
  "path": "/Users/"
})
# NOTE(review): 'user_token' is a placeholder — a valid PAT token must be
# substituted before this script can run.
headers = {
  'Authorization': 'Bearer user_token',
  'Content-Type': 'application/json'
}

# Initial listing of the root path; executes a live HTTP call at module level.
# NOTE(review): this sends a JSON body with a GET request, mirroring the
# workspace/list API usage throughout this script.
response = requests.request("GET", url_list, headers=headers, data=payload).json()
# Shared accumulator filled by list_notebooks() below.
notebooks = []

def list_notebooks(mylist):
  """Recursively collect all NOTEBOOK objects from a workspace/list response.

  Appends into (and returns) the module-level ``notebooks`` list so existing
  callers that read the global keep working.

  Args:
      mylist: parsed JSON response of /api/2.0/workspace/list.

  Returns:
      The shared ``notebooks`` list of notebook object dicts
      (each with at least 'path' and 'object_id').
  """
  # BUG FIX: the list API returns {} for an empty directory; the original
  # top-level call did mylist['objects'] unguarded and raised KeyError.
  for element in mylist.get('objects', []):
    if element['object_type'] == 'NOTEBOOK':
      notebooks.append(element)
    elif element['object_type'] == 'DIRECTORY':
      payload_inner = json.dumps({"path": element['path']})
      response_inner = requests.request("GET", url_list, headers=headers, data=payload_inner).json()
      # Recursion is safe on {} now; the .get() guard above handles it.
      list_notebooks(response_inner)
  return notebooks

# Kick off the recursive scan from the root listing; print a sample entry.
# NOTE(review): print(result[0]) raises IndexError if no notebooks were found.
result = list_notebooks(response)
print(result[0])
class BearerAuth(requests.auth.AuthBase):
    """requests auth helper that attaches a Bearer token to every request."""

    def __init__(self, token):
        self.token = token

    def __call__(self, r):
        # BUG FIX: the original hard-coded 'Bearer user_token' and silently
        # ignored the token passed to __init__; use the stored token instead.
        r.headers["authorization"] = f'Bearer {self.token}'
        return r
# Fetch the access-control list (permissions) for one notebook object id.

def get_permission(var_object_id):
    """Return the 'access_control_list' for the given notebook object id."""
    permissions_url = f"{databricks_instance}/api/2.0/preview/permissions/notebooks/{var_object_id}"
    # NOTE: reuses the module-level `payload` and `headers`, as before.
    resp_json = requests.request("GET", permissions_url, headers=headers, data=payload).json()
    return resp_json['access_control_list']
    
# Flatten an ACL into [name, permission, name, permission, ...] pairs.

def get_control_user_list(access_control_list):
    """Return a flat list alternating principal name and permission level.

    Args:
        access_control_list: list of ACL entry dicts from the permissions API;
            each entry may carry 'user_name' or 'group_name' plus an
            'all_permissions' list.

    Returns:
        [name1, permission1, name2, permission2, ...]
    """
    access_control_user_list = []
    for entry in access_control_list:
        # group_name wins over user_name when both exist (same precedence as
        # the original's assignment order).
        principal = entry.get('group_name', entry.get('user_name'))
        if principal is None:
            # BUG FIX: the original reused the previous iteration's name here
            # (or raised UnboundLocalError on the first entry); skip instead.
            continue
        access_control_user_list.append(principal)
        access_control_user_list.append(entry['all_permissions'][0]['permission_level'])
    return access_control_user_list
  
# Path of the notebook running this script (Databricks-only `dbutils` API);
# used later to exclude this notebook from its own search results.
var_current_notebook_path = dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()

# Label written into the report rows for Oracle matches.
var_search_type_oracle ='Oracle'


# Substring that identifies an Oracle JDBC connection in notebook source.

var_oracle_search_pattern = 'jdbc:oracle'

# Directory where the pipe-delimited report file is written.
# NOTE(review): placeholder path — must exist and be writable.
var_report_target_location = '/writing/into/target/location'

# Build one pipe-delimited report row: notebook path | search type | users/perms.

def get_format_result(var_notebook_path,var_search_type,var_access_user_control_list):
    """Return 'path|type|name1-perm1-name2-perm2-...' for one report row."""
    fields = [
        var_notebook_path,
        var_search_type,
        '-'.join(var_access_user_control_list),
    ]
    return '|'.join(fields)

# Open the report file for writing.
# NOTE(review): opened without a context manager and never explicitly closed
# or flushed anywhere in this script — the interpreter closes it at exit.
f = open(f"{var_report_target_location}/mytarget", 'w')

# Per-category counters for a processing summary.

var_total_notebook_count_processed =0
var_total_notebook_count_oracle    =0
var_total_notebook_count_teradata  =0
var_total_notebook_count_datalakeservice  =0
var_total_notebook_count_oauth  =0


# BUG FIX: var_other_onprem_search_pattern was referenced in the loop below
# but never defined anywhere in the script, so the first iteration raised
# NameError. 'jdbc:' matches any JDBC connection string (including
# 'jdbc:oracle'). TODO(review): confirm the originally intended pattern.
var_other_onprem_search_pattern = 'jdbc:'

# Scan every discovered notebook: export its base64 source, decode it, and
# report notebooks that embed JDBC connection strings alongside 'password'.
for notebook_items in result:
    var_notebook_path = notebook_items["path"]
    var_object_id     = notebook_items["object_id"]
    var_note_payload  = json.dumps({"path": var_notebook_path})
    var_response = requests.request("GET", url_export, headers=headers, data=var_note_payload).json()
    try:
        var_response_content = var_response['content']
    except KeyError:
        # Some notebooks cannot be exported, e.g. {'error':
        # 'DatabricksServiceException: BAD_REQUEST: content size (15395930)
        # exceeded the limit 10485760'} — skip them and continue scanning.
        continue
    var_response_content_str = base64.b64decode(var_response_content).decode("utf-8")
    var_total_notebook_count_processed += 1
    # Require a JDBC pattern plus 'password', and skip this notebook itself.
    if (var_response_content_str.find(var_other_onprem_search_pattern) != -1
            and var_response_content_str.find("password") != -1
            and var_notebook_path != var_current_notebook_path):
        # Oracle-specific connection strings (password/self checks already
        # guaranteed by the outer condition above).
        if var_response_content_str.find(var_oracle_search_pattern) != -1:
            print(var_notebook_path)
            var_access_control_list      = get_permission(var_object_id)
            var_access_user_control_list = get_control_user_list(var_access_control_list)
            var_final_format_record      = get_format_result(var_notebook_path, var_search_type_oracle, var_access_user_control_list)
            print(var_final_format_record)
            f.write(var_final_format_record + '\n')
            var_total_notebook_count_oracle += 1
like image 41
Karthikeyan Rasipalay Durairaj Avatar answered Nov 02 '25 20:11

Karthikeyan Rasipalay Durairaj



Donate For Us

If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!