p4scripts/p4RemoveUnversioned.py

#!/usr/bin/python
# -*- coding: utf8 -*-
# author              : Brian Ernst
# python_version      : 2.7.6 and 3.4.0
# =================================

# todo: switch to `p4 fstat ...`, and parse the output for clientFile and cache it.
# todo: have a backup feature, make sure files are moved to the recycle bin or a temporary file.
# todo: switch to faster method of calling p4 fstat on an entire directory and parsing it's output
# todo: add option of using send2trash
# todo: buffer output, after exceeding a certain amount print to the output.
# todo: allow logging output besides console output, or redirection altogether

import datetime, inspect, marshal, multiprocessing, optparse, os, re, stat, subprocess, sys, threading, time, traceback

# trying ntpath, need to test on linux
import ntpath


re_remove_comment = re.compile( "#.*$" )
def remove_comment( s ):
    return re.sub( re_remove_comment, "", s )


try: input = raw_input
except: pass

def enum(*sequential, **named):
    enums = dict(zip(sequential, range(len(sequential))), **named)
    return type('Enum', (), enums)

MSG = enum('SHUTDOWN', 'PARSE_DIRECTORY', 'RUN_FUNCTION')

p4_ignore = ".p4ignore"

main_pid = os.getpid( )


#if os.name == 'nt' or sys.platform == 'cygwin'
def basename( path ):
    # TODO: import based on platform
    # https://docs.python.org/2/library/os.path.html
    # posixpath for UNIX-style paths
    # ntpath for Windows paths
    # macpath for old-style MacOS paths
    # os2emxpath for OS/2 EMX paths

    #return os.path.basename( path )
    return ntpath.basename( path )

def normpath( path ):
    return ntpath.normpath( path )

def join( patha, pathb ):
    return ntpath.join( patha, pathb )

def splitdrive( path ):
    return ntpath.splitdrive( path )

def get_ignore_list( path, files_to_ignore ):
    # have to split path and test top directory
    dirs = path.split( os.sep )

    ignore_list = [  ]

    for i, val in enumerate( dirs ):
        path_to_find = os.sep.join( dirs[ : i + 1] )

        if path_to_find in files_to_ignore:
            ignore_list.extend( files_to_ignore[ path_to_find ] )

    return ignore_list

def match_in_ignore_list( path, ignore_list ):
    for r in ignore_list:
        if re.match( r, path ):
            return True
    return False

def call_process( args ):
    return subprocess.call( args.split( ), stdout=subprocess.PIPE, stderr=subprocess.PIPE )

def try_call_process( args, path=None ):
    try:
        subprocess.check_output( args.split( ), shell=False, cwd=path )
        return 0
    except subprocess.CalledProcessError:
        return 1

use_bytearray_str_conversion = type( b"str" ) is not str
def get_str_from_process_stdout( line ):
    if use_bytearray_str_conversion:
        return ''.join( map( chr, line ) )
    else:
        return line

def singular_pulural( val, singular, plural ):
    return singular if val == 1 else plural

def parse_info_from_command( args, value, path = None ):
    """

    :rtype : string
    """
    proc = subprocess.Popen( args.split( ), stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=path )
    for line in proc.stdout:
        line = get_str_from_process_stdout( line )

        if not line.startswith( value ):
            continue
        return line[ len( value ) : ].strip( )
    return None

def get_p4_py_results( args, path = None ):
    results = []
    proc = subprocess.Popen( [ 'p4', '-G' ] + args.split( ), stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=path )
    try:
        while True:
            output = marshal.load( proc.stdout )
            results.append( output )
    except EOFError:
        pass
    finally:
        proc.stdout.close()
    return results

# Keep these in mind if you have issues:
# https://stackoverflow.com/questions/16557908/getting-output-of-a-process-at-runtime
# https://stackoverflow.com/questions/4417546/constantly-print-subprocess-output-while-process-is-running
def get_client_set( path ):
    files = set( [ ] )

    make_drive_upper = True if os.name == 'nt' or sys.platform == 'cygwin' else False

    command = "p4 fstat ..."

    proc = subprocess.Popen( command.split( ), stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=path )
    for line in proc.stdout:
        line = get_str_from_process_stdout( line )

        clientFile_tag = "... clientFile "
        if not line.startswith( clientFile_tag ):
            continue

        local_path = normpath( line[ len( clientFile_tag ) : ].strip( ) )
        if make_drive_upper:
            drive, path = splitdrive( local_path )
            local_path = ''.join( [ drive.upper( ), path ] )

        files.add( local_path )

    proc.wait( )

    for line in proc.stderr:
        if "no such file" in line:
            continue
        raise Exception(line)

    return files

def get_client_root( ):
    """

    :rtype : string
    """
    command = "p4 info"

    proc = subprocess.Popen( command.split( ), stdout=subprocess.PIPE, stderr=subprocess.PIPE )
    for line in proc.stdout:
        line = get_str_from_process_stdout( line )

        clientFile_tag = "Client root: "
        if not line.startswith( clientFile_tag ):
            continue

        local_path = normpath( line[ len( clientFile_tag ) : ].strip( ) )

        return local_path
    return None

class PTable( list ):
    def __init__( self, *args ):
        list.__init__( self, args )
        self.mutex = multiprocessing.Semaphore( )

class PDict( dict ):
    def __init__( self, *args ):
        dict.__init__( self, args )
        self.mutex = multiprocessing.Semaphore( )

# TODO: Create a child thread for triggering autoflush events
class Console( threading.Thread ):
    MSG = enum('WRITE', 'FLUSH', 'SHUTDOWN', 'CLEAR' )

    # auto_flush_time is time in milliseconds since last flush to trigger a flush when writing
    def __init__( self, auto_flush_num = None, auto_flush_time = None ):
        threading.Thread.__init__( self )
        self.buffers = {}
        self.buffer_write_times = {}
        self.running = True
        self.queue = multiprocessing.JoinableQueue( )
        self.auto_flush_num = auto_flush_num if auto_flush_num is not None else -1
        self.auto_flush_time = auto_flush_time * 1000 if auto_flush_time is not None else -1
        self.shutting_down = False

    def write( self, data, pid = None ):
        self.queue.put( ( Console.MSG.WRITE, pid if pid is not None else os.getpid(), data ) )

    def writeflush( self, data, pid = None ):
        pid = pid if pid is not None else os.getpid()
        self.queue.put( ( Console.MSG.WRITE, pid, data ) )
        self.queue.put( ( Console.MSG.FLUSH, pid ) )

    def flush( self, pid = None ):
        self.queue.put( ( Console.MSG.FLUSH, pid if pid is not None else os.getpid() ) )

    def clear( self, pid = None ):
        self.queue.put( ( Console.MSG.CLEAR, pid if pid is not None else os.getpid() ) )

    def __enter__( self ):
        self.start( )
        return self

    def __exit__( self, type, value, tb ):
        self.queue.put( ( Console.MSG.SHUTDOWN, ) )
        self.queue.join( )

    def run( self ):
        while True:
            data = self.queue.get( )
            event = data[0]

            if event == Console.MSG.SHUTDOWN:
                # flush remaining buffers before shutting down
                for ( pid, buffer ) in self.buffers.items( ):
                    for line in buffer:
                        print( line )
                self.buffers.clear( )
                self.buffer_write_times.clear( )
                self.queue.task_done( )

                #print(self.queue.qsize())
                #print(self.queue.empty())
                break

            elif event == Console.MSG.WRITE:
                pid, s = data[ 1 : ]

                if pid not in self.buffers:
                    self.buffers[ pid ] = []
                if pid not in self.buffer_write_times:
                    self.buffer_write_times[ pid ] = datetime.datetime.now( )
                self.buffers[ pid ].append( s )

                if self.auto_flush_num >= 0 and len( self.buffers[ pid ] ) >= self.auto_flush_num:
                    self.flush( pid )
                elif self.auto_flush_time >= 0 and ( datetime.datetime.now( ) - self.buffer_write_times[ pid ] ).microseconds >= self.auto_flush_time:
                    self.flush( pid )
                # TODO: if buffer is not empty and we don't auto flush on write, sleep until a time then auto flush according to auto_flush_time
            elif event == Console.MSG.FLUSH:
                pid = data[ 1 ]
                if pid in self.buffers:
                    for line in self.buffers[ pid ]:
                        print( line )
                    self.buffers.pop( pid, None )
                    self.buffer_write_times[ pid ] = datetime.datetime.now( )
            elif event == Console.MSG.CLEAR:
                pid = data[ 1 ]
                if pid in self.buffers:
                    self.buffers.pop( pid, None )

            self.queue.task_done( )

# class Task( threading.Event ):
    # def __init__( data, cmd = None ):
        # threading.Event.__init__( self )

        # self.cmd = cmd if cmd is None MSG.RUN_FUNCTION
        # self.data = data

    # def isDone( self ):
        # return self.isSet()

    # def join( self ):
        # self.wait( )

class Worker( threading.Thread ):
    def __init__( self, console, queue, files_to_ignore ):
        threading.Thread.__init__( self )

        self.console = console
        self.queue = queue
        self.files_to_ignore = files_to_ignore

    def run( self ):
        while True:
            ( cmd, data ) = self.queue.get( )

            if cmd == MSG.SHUTDOWN:
                self.console.flush( )
                self.queue.task_done( )
                break

            if cmd == MSG.RUN_FUNCTION:
                break

            if cmd != MSG.PARSE_DIRECTORY or data is None:
                self.console.flush( )
                self.queue.task_done( )
                continue

            directory = data

            # add threading stuffs

            self.queue.task_done( )

def main( args ):
    start = time.clock()

    # check requirements
    if call_process( 'p4 -V' ) != 0:
        print( 'Perforce Command-line Client(p4) is required for this script.' )
        sys.exit( 1 )

    #http://docs.python.org/library/optparse.html
    parser = optparse.OptionParser( )

    parser.add_option( "-d", "--dir", dest="directory", help="Desired directory to crawl.", default=None )
    parser.add_option( "-t", "--threads", dest="thread_count", help="Number of threads to crawl your drive and poll p4.", default=100 )
    parser.add_option( "-q", "--quiet", action="store_true", dest="quiet", help="This overrides verbose", default=False )
    parser.add_option( "-v", "--verbose", action="store_true", dest="verbose", default=True )
    parser.add_option( "-i", "--interactive", action="store_true", dest="interactive", default=False )

    ( options, args ) = parser.parse_args( args )

    directory = normpath( options.directory if options.directory is not None else os.getcwd( ) )

    # get user
    print("\nChecking p4 info...")
    result = get_p4_py_results('info')
    if len(result) == 0 or b'userName' not in result[0].keys():
        print("Can't find perforce info, is it even setup?")
        sys.exit(1)
    username = get_str_from_process_stdout(result[0][b'userName'])
    client_host = get_str_from_process_stdout(result[0][b'clientHost'])
    print("|Done.")

    client_root = get_client_root()
    ldirectory = directory.lower()
    workspace_name = None
    if client_root is None or not ldirectory.startswith(client_root.lower()):
        print("\nCurrent directory not in client view, checking other workspaces for user '" + username + "' ...")

        workspace_name = parse_info_from_command('p4 info', 'Client name: ')

        # get user workspaces
        result = get_p4_py_results('workspaces -u ' + username)
        workspaces = []
        for r in result:
            whost = get_str_from_process_stdout(r[b'Host'])
            if whost is not None and len(whost) != 0 and client_host != whost:
                continue
            workspace = {'root': get_str_from_process_stdout(r[b'Root']), 'name': get_str_from_process_stdout(r[b'client'])}
            workspaces.append(workspace)

        del result

        # check current directory against current workspace, see if it matches existing workspaces.
        for w in workspaces:
            wname = w['name']
            wlower = w['root'].lower()
            if ldirectory.startswith(wlower):
                # set current directory, don't forget to revert it back to the existing one
                print("|Setting client view to: " + wname)

                if try_call_process( 'p4 set P4CLIENT=' + wname ):
                    print("|There was a problem trying to set the p4 client view (workspace).")
                    sys.exit(1)
                break
        else:
            print( "|Couldn't find a workspace root that matches the current directory for the current user." )
            sys.exit(1)
        print("|Done.")


    # Files are added from .p4ignore
    # Key is the file root, the value is the table of file regexes for that directory.
    files_to_ignore = PDict()

    processed_file_count = 0
    processed_directory_count = 0

    remove_file_count = 0
    remove_dir_count = 0
    warning_count = 0
    error_count = 0

    with Console( auto_flush_num=20, auto_flush_time=1000 ) as c:
        if not options.quiet:
            c.writeflush( "\nCaching files in depot, this may take a little while..." )

        # TODO: push this off to a thread and walk the directory so we get a headstart.
        files_in_depot = get_client_set( directory )

        c.writeflush( "|Done." )

        # TODO: push a os.walk request off to a thread to build a list of files in the directory; create batch based on directory?

        # TODO: at this point join on both tasks to wait until they're done

        # TODO: kick off file removal, make batches from the files for threads to work on since testing has to be done for each.
        #       need to figure out the best way to do this since the ignore list needs to be properly built for each directory;
        #       will at least need to redo how the ignore lists are handled for efficiencies sake.

        if not options.quiet:
            c.writeflush( "\nChecking " + directory)
        for root, dirs, files in os.walk( directory ):
            ignore_list = get_ignore_list( root, files_to_ignore )

            if not options.quiet:
                c.write( "|Checking " + os.path.relpath( root, directory ) )

            for d in dirs:
                processed_directory_count += 1
                path = join( root, d )
                rel_path = os.path.relpath( path, directory )

                if match_in_ignore_list( path, ignore_list ):
                    # add option of using send2trash
                    if not options.quiet:
                        c.write( "| ignoring " + rel_path )
                    dirs.remove( d )

            for f in files:
                processed_file_count += 1
                path = normpath( join( root, f ) )

                if path not in files_in_depot:
                    if not options.quiet:
                        c.write( "| " + f + " is unversioned, removing it." )
                    try:
                        os.chmod( path, stat.S_IWRITE )
                        os.remove( path )
                        remove_file_count += 1
                    except OSError as ex:
                        c.writeflush( "|  " + type( ex ).__name__ )
                        c.writeflush( "|  " + repr( ex ) )
                        c.writeflush( "|  ^ERROR^" )

                        error_count += 1
        if not options.quiet:
            c.write( "|Done." )

        if not options.quiet:
            c.write( os.linesep + "Removing empty directories...")
        # remove empty directories in reverse order
        for root, dirs, files in os.walk( directory, topdown=False ):
            ignore_list = get_ignore_list( root, files_to_ignore )

            for d in dirs:
                processed_directory_count += 1
                path = os.path.join( root, d )
                rel_path = os.path.relpath( path, directory )

                if match_in_ignore_list( path, ignore_list ):
                    # add option of using send2trash
                    if not options.quiet:
                        c.write( "| ignoring " + rel_path )
                    dirs.remove( d )
                try:
                    os.rmdir(path)
                    remove_dir_count += 1
                    if not options.quiet:
                        c.write( "| " + rel_path + " was removed." )
                except OSError:
                    # Fails on non-empty directory
                    pass
        if not options.quiet:
            c.write( "|Done." )

        # This needs to happen automatically even when an exception happens, when we leave scope.
        if workspace_name is not None:
            c.write("\nReverting back to original client view...")
            # set workspace back to the original one
            if try_call_process( 'p4 set P4CLIENT=' + workspace_name ):
                error_count += 1
                if not options.quiet:
                    c.write("|There was a problem trying to restore the set p4 client view (workspace).")
            else:
                if not options.quiet:
                    c.write("|Reverted client view back to '" + workspace_name + "'.")
            if not options.quiet:
                c.write("|Done.")

        if not options.quiet:
            output = "\nChecked " + str( processed_file_count ) + singular_pulural( processed_file_count, " file, ", " files, " )
            output += str( processed_directory_count ) + singular_pulural( processed_directory_count, " directory", " directories")

            output += "\nRemoved " + str( remove_file_count ) + singular_pulural( remove_file_count, " file, ", " files, " )
            output += str( remove_dir_count ) + singular_pulural( remove_dir_count, " directory", " directories")

            if warning_count > 0:
                output += " w/ " + str( warning_count ) + singular_pulural( warning_count, " warning", " warnings" )
            if error_count > 0:
                output += " w/ " + str( error_count ) + singular_pulural( error_count, " error", " errors" )

            end = time.clock()
            delta = end - start
            output += "\nFinished in " + str(delta) + "s"

            c.write( output )

if __name__ == "__main__":
    try:
        main( sys.argv )
    except:
        print( "Unexpected error!" )
        traceback.print_exc( file = sys.stdout )