#!/usr/bin/perl # ------------------------------------------------------------------------- # $Id: c2html,v 1.3 2001/12/18 19:30:32 castaglia Exp $ # $Source: /cvsroot/pdd/devel-guide/bin/c2html,v $ # ------------------------------------------------------------------------- # ------------------------------------------------------------------------- # COPYRIGHT NOTICE # Copyright (C) 2000 TJ Saunders All Rights Reserved. # # You may distribute under the terms of either the GNU General Public # License or the Artistic License, as specified in the Perl README file, # with the exception that it cannot be placed on a CD-ROM or similar media # for commercial distribution without the prior approval of the author. # # ------------------------------------------------------------------------- use strict; use Carp; use File::Basename qw(basename dirname); use Getopt::Std; my $program = basename($0); my %opts = (); if (scalar(@ARGV) < 1) { print STDOUT "usage: $program -h for further help\n"; exit(1); } # parse the command-line options getopts("Td:k:hl:nrs:t:v", \%opts); my @source_files = @ARGV; my $verbose = 0; $verbose = 1 if defined($opts{'v'}); my $html_dir = "html"; usage() if defined($opts{'h'}); # determine the correct extension for the HTML files, based on # OS type (.htm for Windows, .html otherwise) my $html_file_extension = '.html'; # NOTE: make this more intelligent (eg determine language type from extension # of input files my $language = 'c'; my @comment_chars = ('/*', '*/'); my @source_extensions = ('*.[Cc]', '*.[Hh]'); if ($verbose) { print STDOUT "$program: using file extensions: ", join(" ", @source_extensions), "\n"; } # determine the directories to be skipped, when recursing my @skip_directories = (); if (defined($opts{'s'})) { # split the input string my @directories = split(/,(\s*?)/, $opts{'s'}); # make sure the specified directories are valid. Assume the path is # relative to the current directory, unless it begins with a '/'. foreach my $dir (@directories) { # keep this directory if it's valid if (-d $dir) { push(@skip_directories, $dir) } else { print STDOUT "$program: skip directory $dir does not exist\n"; } } } # define some colors for the keywords my $preprocessor_color = "#ff3333"; my $datatype_color = "#771199"; my $default_color = "blue"; # retrieve the keywords hash that pertains to the current source file(s) my $keywords = get_keywords( language => $language ); my $tags_file = "TAGS"; if (defined($opts{'t'})) { $tags_file = $opts{'t'}; } # make sure that user-specified tags file exists and is readable unless (-e $tags_file) { die "$program: tags file $tags_file does not exist\n"; } unless (-r $tags_file) { die "$program: tags file $tags_file is not readable\n"; } my $tags = get_tags( file => $tags_file ); $html_dir = $opts{'d'} if defined($opts{'d'}); # make sure the user-specified directory exists. If it does, make sure # I can write to it. If it doesn't exist, create it. unless (-d $html_dir) { unless (mkdir $html_dir, oct(644)) { die "$program: unable to create $html_dir: $!\n"; } print STDOUT "$program: creating $html_dir\n" if $verbose; } unless (-w $html_dir) { die "$program: $html_dir is not writeable\n"; } print STDOUT "$program: outputting HTML files to $html_dir\n" if $verbose; # now I know where to output files. Deal with the files in the current # directory first (either via $source_extensions or @source_files), then # determine whether to recurse # if there's anything left from the command-line, assume it's a list of # files to be processed. These "leftover" files will override the # language file extension pattern for the current directory only. my @htmlified_files; if (!$opts{'r'} && !@ARGV) { @source_files = glob(@source_extensions); htmlify_dir( dir => ".", files => \@source_files ); } else { if (@ARGV) { htmlify_dir( dir => ".", files => \@ARGV ); } else { htmlify_dir( dir => "." ); } } # generate the index.html file, to be located in the $html_dir top level # NOTE: future improvements could use the Getopt::Long modules, and allow # for index.html files in each subdirectory (assuming -r is set, of course) # as well as one master index. unless ($opts{'n'}) { print STDOUT "$program: generating index$html_file_extension\n" if $verbose; make_file_index( output_dir => $html_dir ); if ($opts{'T'}) { print STDOUT "$program: generating tagIndex$html_file_extension\n" if $verbose; make_tag_index( output_dir => $html_dir ); } } # done! exit 0; # ---------------------------------------------------------------------- # Functions # ---------------------------------------------------------------------- sub get_dirs { my ($dir) = @_; my @dirs = (); # open a directory handle opendir(DIR, $dir) or die "$program: get_dirs(): unable to open $dir: $!\n"; # read in all the directories in this directory while (my $dir_entry = readdir(DIR)) { if (-d "$dir/$dir_entry") { push(@dirs, $dir_entry); } } closedir(DIR); # strip out the . and .. directories @dirs = grep !/^\.\.?$/, @dirs; # and skip $html_dir, too... @dirs = grep !/$html_dir/, @dirs; return \@dirs; } # --------------------------------------------------------------------- sub get_keywords { my %args = @_; unless (defined($args{'language'})) { die "$program: missing required argument: language\n"; } my $language = $args{'language'}; # open the keyword database contained in this file and read in the list of # words that I should pay attention to # NOTE: future enhancement -- enable grouping of keyword into classes # like variables, operators, data types, etc etc, allowing for a different # HTML handling of each class (like a different color for data types than # language-specific operators, etc) # slurp the entire list of keywords into an array my @keyword_specs = (); while (my $line = ) { # skip any comments next if $line =~ /^#/; # skip blank lines, too next if $line =~ /^\s+$/; chomp($line); push(@keyword_specs, $line); } # setup hash for keywords my %keywords; my $keyword_color = $default_color; my $keep = 0; foreach my $spec (@keyword_specs) { if ($spec =~ /<$language>/) { $keep = 1; next; } next unless $keep; # NOTE: future capability of grouping language keywords, and allowing # for custom coloring of each group, will be done here. # NOTE: grouping in development -- here, hardcoded using the idea # of "context". Each "context" is assigned a color. # NOTE: C has both preprocess "if" and "else", as well as language # "if" and "else". How to handle that? Make handle_keyword_token() # aware of the differences (ie the leading # in a preprocessor # directive). if ($keep) { # break the specification down, keeping track of context if ($spec =~ /(\s+)?/) { $keyword_color = $preprocessor_color; next; } elsif ($spec =~ /(\s+)?/) { $keyword_color = $datatype_color; next; } elsif ($spec =~ /(\s+)?<\/preprocessor>/ || $spec =~ /(\s+)?<\/datatype>/) { $keyword_color = $default_color; next; } if ($spec !~ /(\s+)?<\/$language>/) { # remove whitespace from the specification my $keyword = $spec; $keyword =~ s/\s+//g; $keywords{$keyword} = "$keyword<\/font>"; next; } } if ($spec =~ /<\/$language>/) { # skip the rest last; } } # return the HTML/keyword hash as a reference return \%keywords; } # ---------------------------------------------------------------------- sub get_tags { my %args = @_; unless (defined($args{'file'})) { die "$program: missing required argument: file\n"; } my $tags_file = $args{'file'}; # read in the tags file open(TAGS, "< $tags_file") or die "$program: get_tags(): unable to open $tags_file: $!\n"; chomp(my @tags = ); close(TAGS) or die "$program: get_tags(): unable to close $tags_file: $!\n"; # parse the @tags array, restructuring it into more useable hashes # The idea here is to impart an awareness of the relationships between # source files that ctags doesn't have (it can't...it's made to do its # thing only on _one_ file, not on a source tree). my %tags; foreach my $tag_line (@tags) { my %entry; my $link; # this array depends on the format of the tags file, usually output # from the ctags program... my ($tag_name, $tag_type, $tag_line_number, $tag_source_file, @tag_declaration) = split(/\s+/, $tag_line); # if the tag source name begins with a ./, strip that off (it confuses # things later...) $tag_source_file =~ s/^\.\///; # join the tag declaration into one string my $tag_declaration = join(" ", @tag_declaration); # NOTE: how will this structure handle multiple tags with the same name? # This can arise with functions with file scope, thus allowing for # multiple functions with the same tag name -- duh, like main()? =) # NOTE: Answer -- create a structure whose primary keys are tag names, # and whose secondary keys are source files. The structure this # key combination references will contain the line number at which # to place the HTML anchor (if the source file is the one which defines # that tag), and a link to that source file/anchor to be used for # any other line number/source file combination for that tag. # The tag declaration will be used to determine exactly which tag # source file to use as the anchor reference (eg, the same tag name # might have the same tag type -- "macro" -- but one actually defines # the macro value, and one is just a preprocessor references of that # #define value. # populate the %entry hash. First, check $tags to see if this # particular tag name has been encountered before. If so, then # a decision needs to be made about which tag entry is the canonical # source, and which is the reference to that canonical source. # ignore "variable" tags for now... # NOTE: make this a command-line option next if ($tag_type eq "variable"); # ignore "member" tags, too.. # NOTE: again, make this a command-line option next if ($tag_type eq "member"); # now, determine whether this tag deserves to be _the_ definition # and thus deserving of an anchor key, or if it's just a reference, # based upon its declaration. if ($tag_type eq 'enumerator') { # there really isn't much to the declaration of an enumerator. For # now, just assume that this tag is _the_ definition $entry{'anchor'} = $tag_line_number; $link = "$tag_name<\/a>"; } elsif ($tag_type eq 'function') { # let's see how intelligent ctags is. For now, just assume that this # tag is the definition $entry{'anchor'} = $tag_line_number; $link = "$tag_name<\/a>"; } elsif ($tag_type eq 'macro') { # if the declaration does not begin with a #\s+?define, it's a reference, # else it's the definition. Simple. if ($tag_declaration =~ /^(\s+)?#(\s+)?define/ ) { $entry{'anchor'} = $tag_line_number; $link = "$tag_name<\/a>"; } } elsif ($tag_type eq 'struct') { # let's see how intelligent ctags is. For now, just assume that this # tag is the definition $entry{'anchor'} = $tag_line_number; $link = "$tag_name<\/a>"; } elsif ($tag_type eq 'typedef') { # let's see how intelligent ctags is. For now, just assume that this # tag is the definition $entry{'anchor'} = $tag_line_number; $link = "$tag_name<\/a>"; } else { # should never reach this point, unless member/variable tags are # allowed } # if $link contains a reference to a file in a "skipped" directory, # then rewrite the link so that it is just plain text if (scalar(@skip_directories)) { foreach my $skip_dir (@skip_directories) { # make a copy of the directory name to be skipped, then escape # any metacharacters that can cause trouble... my $dir = $skip_dir; $dir =~ s/([\/\\\+\*])/\\$1/g; if ($tag_source_file =~ /^$dir/) { $link = $tag_name; if ($verbose) { print STDOUT "$program: skipping tag $tag_name for $tag_source_file\n"; } } } } # store the references to this tag information $tags{$tag_name}{$tag_source_file} = \%entry; $tags{$tag_name}{'link'} = $link; # NOTE: future improvement -- once grouping of datatype keywords is # implemented, allow for 'typedefs' to add to that datatype group, # so that a typedef'd object will be treated just like any other # data type. } # return a reference to the tags hash return \%tags; } # ---------------------------------------------------------------------- sub handle_comments { my %args = @_; # NOTE: THIS FUNCTION CURRENTLY UNDER DEVELOPMENT, AND IS NOT IN USE my $string = $args{'string'}; # find comments, and mark them with the appropriate HTML tags # NOTE: the comment-determine routine should also know how to handle # Perl PODs # NOTE: this should also recognize when the comment characters are # themselves in an embedded string, and thus to be ignored... # NOTE: these "in a comment" checks are too simplistic, and thus # are causing problems. my $line; my $comment_count; my $in_a_comment; my $number_of_lines; my $source_file; if ($line =~ /\/\*/) { # put the comments in green $line =~ s/\/\*/\/\*/; $comment_count++; $in_a_comment = $number_of_lines; } # initialize the line of actual code, as opposed to a comment line # NOTE: this next section is a bit messy -- take some time to clean up # the organization and logic flow... my $code_line; if (! $in_a_comment) { $line = htmlify_line( string => $line, file => $source_file, line_number => $number_of_lines ); } elsif ($in_a_comment == $number_of_lines) { # in a line with a comment. This split() call needs to be made # a little more intelligent, or at least the handling of its output, # since it effectively strips the end off comments, which makes a problem # when the comment end is in that bit that gets discarded... my @symbols = split(/\/\*/, $line, 2); # ignore the trailing comments, I'm only interested in the code right # now... $line = htmlify_line( string => $symbols[0], file => $source_file, line_number => $number_of_lines ); # append the rest of the line to the htmlified line... $line .= '/*'; $line .= $symbols[1]; } # find closing comment if ($line =~ /\*\// && $line !~ /"(.*?)?\*\/(.*?)?"/) { # close the comment font-coloring tag $line =~ s/\*\//\*\/<\/font>/; $comment_count--; $in_a_comment = 0; } return $string; } # ---------------------------------------------------------------------- sub handle_keyword_token { my %args = @_; my $string = $args{'string'}; my $token = $args{'token'}; # if the current token is a keyword, colorize it -- unless that token # appears in a string (denoted by "'s), in which case, ignore it. $string = replace_tokens( string => $string, token => $token, html => $keywords->{$token} ); return $string; } # ---------------------------------------------------------------------- sub handle_link_token { my %args = @_; my $string = $args{'string'}; my $token = $args{'token'}; my $source_file = $args{'file'}; my $line_number = $args{'line_number'}; my $link; if (defined($tags->{$token}{$source_file}) && $line_number == $tags->{$token}{$source_file}->{'anchor'}) { # if the current line of code contains a tag, plant an HTML anchor print HTML "<\/a>"; # the "link", in this case, will be just be the token name $link = $token; } else { # get the right URL unless ($link = $tags->{$token}{'link'}) { # this covers the possibility that there is no "link" for the # given token, for that link depends on the ctags output. Some # tags might be defined by system headers, libraries, or by # the compiler, in which case ctags won't know about their definition, # just the reference. Without this clause, which just passes the # token through unaltered, it would be translated to a blank, which # is less than desired behavior. $link = $token; } } # substitute the token in the current line with the corresponding # HTML link. Make sure that links points to the right (relative) # URL. And make sure that all non-alphanumeric characters are # appropriately quoted. my $filename = "$source_file$html_file_extension"; # make a copy of the filename for comparison reasons my $compare_filename = $filename; # make sure that all non-alphanumeric characters are appropriately quoted $compare_filename =~ s/([\/\\\+\*])/\\$1/g; if ($link =~ /$compare_filename/) { # the would-be link points to this source file, so to make it relative, # just remove the filename portion from the URL $link =~ s/$filename//; } elsif ($link =~ /a href=\"(\S+)\.htm(l)?/) { # count the number of /'s in $source_file. This is the # subdirectory depth, measured from $html_dir, of the current file. # If the URL doesn't point to the current file, prepend that # number of ../'s to make a working relative URL my $count = $source_file =~ tr/\//\//; my $new_filename = ('../' x $count) . $1; $link =~ s/$1/$new_filename/; } # make sure, on these substitutions, that $token is not embedded # in some larger word. $string = replace_tokens( string => $string, token => $token, html => $link); return $string; } # ---------------------------------------------------------------------- sub htmlify_dir { my %args = @_; my $current_dir = $args{'dir'}; # create this current directory under the HTML directory tree if # it doesn't exist there, in order to maintain the same file layout # in the HTML directory my $output_dir; unless ($current_dir eq ".") { $output_dir = $html_dir . '/' . $current_dir; } else { $output_dir = $html_dir; } unless (-d $output_dir or $current_dir eq ".") { unless (mkdir $output_dir, oct(777)) { die "$program: htmlify_dir(): unable to create $output_dir: $!\n"; } } my @source_files; unless (defined($args{'files'})) { # first, tack on the current directory to the beginning of each # extension pattern # this local copy of the file extensions is necessary for preserving # the original extensions when prepending subdirectory names... my @current_extensions = @source_extensions; foreach my $pattern (@current_extensions) { unless ($current_dir eq ".") { $pattern = $current_dir . "/" . $pattern; } } @source_files = glob(join(" ", @current_extensions)); } else { @source_files = @{ $args{'files'} }; # and undefine the recurse option while we're at it, since if a list # of files to be processed is passed via the command-line, recursion # will be assumed to not be desired. delete $opts{'r'}; } foreach my $source_file (@source_files) { htmlify_file( file => $source_file, output_dir => $output_dir ); } if ($opts{'r'}) { my @dirs = @{ get_dirs( $current_dir ) }; SUBDIR: foreach my $subdir (@dirs) { unless ($current_dir eq ".") { $subdir = $current_dir . "/" . $subdir; } # if the user specified any directories to skip, make sure to skip # them here if (scalar(@skip_directories)) { foreach my $skip_dir (@skip_directories) { # make a copy of the directory name to be skipped, then escape # any metacharacters that can cause trouble... my $dir = $skip_dir; $dir =~ s/([\/\\\+\*])/\\$1/g; if ($subdir =~ /^$dir/) { print STDOUT "$program: skipping $subdir\n" if $verbose; next SUBDIR; } } } htmlify_dir( dir => $subdir ); } } return 1; } # ---------------------------------------------------------------------- sub htmlify_file { my %args = @_; my $source_file = $args{'file'}; my $output_dir = $args{'output_dir'}; my $comment_count = 0; # open, and read, the current source file open(SOURCE, "< $source_file") or die "$program: htmlify_file(): unable to open $source_file: $!\n"; my @lines = ; close(SOURCE) or die "$program: htmlify_file(): unable to close $source_file: $!\n"; # trim off any trailing slashes in $output_dir if ($output_dir =~ /\/$/) { chop($output_dir); } # open, and prepare, the HTML page for this source file my $html_file; if ($opts{'r'}) { # if recursing, just get the base of the filename. The necessary # directory will already have been created. $html_file = $output_dir . "/" . basename($source_file) . $html_file_extension; } else { # if not recursing, get the directories of the filename, and make # sure they exist under $output_dir. If not, create them. my $directories = dirname($source_file); if ($verbose) { print STDOUT "$program: creating directory $output_dir/$directories\n"; } unless (-d "$output_dir/$directories") { unless (mkdir "$output_dir/$directories", oct(777)) { die "$program: htmlify_file(): unable to create $output_dir/$directories: $!\n"; } } $html_file = $output_dir . "/" . $source_file . $html_file_extension; } open(HTML, "> $html_file") or die "$program: htmlify_file(): unable to open $html_file: $!\n"; print_html_header( title => $source_file ); # use the
 HTML tag for preserving the layout of the source code
  print HTML "
\n";

  # initialize the line number count
  my $number_of_lines = 0;

  # set the "in a comment" flag
  my $in_a_comment = 0;

  foreach my $line (@lines) {

    # keep track of the current line number with this counter
    $number_of_lines++;

    # find and replace ampersands
    $line =~ s/&/&/g;

    # find and replace less thans and greater thans
    $line =~ s//>/g;

    # find comments, and mark them with the appropriate HTML tags

    # NOTE: this comment determination will need to be made into a subroutine
    #  in order to correctly handle the different comment characters of
    #  the different languages

    # NOTE: the comment-determine routine should also know how to handle
    #  Perl PODs

    # NOTE: this should also recognize when the comment characters are
    #  themselves in an embedded string, and thus to be ignored...

    # NOTE: deal with single quotes (') correctly, as in:
    #  while(**cp && (quote_mode ? (**cp != '\"') : !isspace((UCHAR)**cp))) {
    #
    #  which is currently causing trouble.

    if ($line =~ /\/\*/) {

      if ($line =~ /".*?".*?\/\*.*?(")?/ ) {

        # put the comments in green
        $line =~ s/\/\*/\/\*/;
      
        $comment_count++;
        $in_a_comment = $number_of_lines;

      } elsif ($line =~ /".*?\/\*.*?"/) {
        
        # this catches comment characters embedded in real strings,
        # which are to be left alone (ie, a no-op)

      } else {

        # put the comments in green
        $line =~ s/\/\*/\/\*/;

        $comment_count++;
        $in_a_comment = $number_of_lines;
      }
    }

    # initialize the line of actual code, as opposed to a comment line
    # NOTE: this next section is a bit messy -- take some time to clean up
    #  the organization and logic flow...

    my $code_line;

    if (! $in_a_comment) {

      $line = htmlify_line( string => $line, file => $source_file,
        line_number => $number_of_lines );

    } elsif ($in_a_comment == $number_of_lines) {
   
      # in a line with a comment.  This split() call needs to be made
      # a little more intelligent, or at least the handling of its output,
      # since it effectively strips the end off comments, which makes a problem
      # when the comment end is in that bit that gets discarded...
      
      my @symbols = split(/\/\*/, $line, 2);

      # ignore the trailing comments, I'm only interested in the code right
      # now...

      $line = htmlify_line( string => $symbols[0], file => $source_file,
        line_number => $number_of_lines );

      # append the rest of the line to the htmlified line...
      $line .= '/*';
      $line .= $symbols[1];

    }

    # find closing comment
    if ($in_a_comment && $line =~ /\*\// && $line !~ /"(.*?)?\*\/(.*?)?"/) {  

      # close the comment font-coloring tag
      $line =~ s/\*\//\*\/<\/font>/;

      $comment_count--;
      $in_a_comment = 0;
    }

    # prettify the output HTML by wrapping the text?
    print HTML $line;
  }

  print_html_footer();

  if ($verbose && $comment_count) {

    # if there are comments unaccounted for, display the count of them

    print STDOUT
      "$program: $source_file: UnclosedCommentCount = $comment_count\n";
  }

  print STDOUT "$program: htmlified $source_file\n" if $verbose;

  # add this filename to the list of htmlified files
  push(@htmlified_files, $source_file);

  close(HTML) or
    die "$program: htmlify_file(): unable to close $html_file: $!\n";

  return 1;
}

# ----------------------------------------------------------------------
sub htmlify_line {
  my %args = @_;

  my $line = $args{'string'};
  my $source_file = $args{'file'};
  my $line_number = $args{'line_number'};

  # tokenize each word in $line, keeping only the unique tokens

  my @words = split(/\W+/, $line);

  my %seen = ();
  my @tokens = ();

  foreach my $word (@words) {
    push(@tokens, $word) unless $seen{$word}++;
  }

  foreach my $token (@tokens) {

    # if the current token is a keyword, colorize it -- unless that token
    # appears in a string (denoted by "'s), in which case, ignore it.

    if ($keywords->{$token}) {

      # NOTE: UGLY HACK to handle keywords that are used both as preprocessor
      #  directives as well as language words.  This needs to be abstracted,
      #  generalized, as it currently relies as _a priori_ knowledge of these
      #  dual-purpose keywords...

      my $replace_color = 0;
      my $replaced_html;

      if ($line =~ /\s*?#\s*?if/ || $line =~ /\s*?#\s*?else/) {

        # if this occurs, then the keyword should have the preprocessor
        # HTML font-color.  Make sure it does, but keep a copy of
        # whatever HTML tags _are_ set for it, to be restored later.

        $replace_color = 1;

        # get a temporary copy of the previous color tags
        $replaced_html = $keywords->{$token};

        # now, make it the preprocessor color tags
        $keywords->{$token} =
          "$token<\/font>";
      }

      $line = handle_keyword_token( string => $line, token => $token );

      if ($replace_color) {

        # reset the flag
        $replace_color = 0;

        # restore the original HTML font tag to %keywords
        $keywords->{$token} = $replaced_html;       
      }

    } elsif (defined($tags->{$token})) {

      # replace the token with the correct link
      $line = handle_link_token( string => $line, token => $token,
        file => $source_file, line_number => $line_number );
    }
  }

  return $line;
}

# ----------------------------------------------------------------------
sub make_file_index {
  my %args = @_;

  my $html_file = $html_dir . "/index$html_file_extension";

  open(HTML, "> $html_file") or
    die "$program: make_file_index(): unable to open $html_file: $!\n";

  print_html_header( title => "Index of Files" );

  print HTML "
\n"; print HTML "
INDEX OF SOURCE FILES
\n"; print HTML "
"; print HTML "

\n"; # if the tag index was requested, place a link to the generated # function index on this page if (defined($opts{'T'})) { print HTML "
Index of Tags"; print HTML "
\n"; print HTML "
\n"; } print HTML "
    \n"; # loop through @htmlified_files, and print a link for each htmlified file foreach my $file (sort @htmlified_files) { # make sure the pointed-to HTML file exists if (-e "$html_dir/$file$html_file_extension") { print HTML "
  • $file\n"; } } print HTML "

\n"; print_html_footer(); close(HTML) or die "$program: make_file_index(): unable to close $html_file: $!\n"; return 1; } # ---------------------------------------------------------------------- sub make_tag_index { my %args = @_; my $html_file = $html_dir . "/tagIndex$html_file_extension"; open(HTML, "> $html_file") or die "$program: make_tag_index(): unable to open $html_file: $!\n"; print_html_header( title => "Index of Tags" ); print HTML "
INDEX OF TAGS
\n"; print HTML "
"; print HTML "

\n"; print HTML "
\n"; print_html_footer(); close(HTML) or die "$program: make_tag_index(): unable to close $html_file: $!\n"; return 1; } # ---------------------------------------------------------------------- sub print_html_footer { my $timestamp = scalar(localtime(time())); # print the HTML foote, including a timestamp print HTML <
Last Updated: $timestamp

HTML generated by tj's c2html script
END_OF_FOOTER } # ---------------------------------------------------------------------- sub print_html_header { my %args = @_; my $title = $args{'title'}; # print out a header to the current HTML stream print HTML < $title END_OF_HEADER } # ---------------------------------------------------------------------- sub replace_tokens { my %args = @_; my $string = $args{'string'}; my $token = $args{'token'}; my $html = $args{'html'}; # if $string contains any "'s or <>'s, do the token replacement # outside of those areas. Parse $string, and operate only on the # "good" pieces my @chars = split(//, $string); my $in_html_tag = 0; my $in_char_string = 0; my $in_quote_string = 0 ; my @start_positions = (); my @end_positions = (); for (my $index = 0; $index <= $#chars; $index++) { if (!$in_quote_string && $chars[$index] eq '<') { # we're in an HTML tag $in_html_tag = 1; push(@start_positions, $index); } elsif (!$in_quote_string && $chars[$index] eq '>') { # HTML tag is over $in_html_tag = 0; push(@end_positions, $index); } elsif (!$in_html_tag && ($chars[$index] eq "\"" && $chars[$index - 1] ne "\\")) { if ($in_quote_string) { # the embedded string is over $in_quote_string = 0; push(@end_positions, $index); } else { # an embedded string is just starting $in_quote_string = 1; push(@start_positions, $index); } } } # check for the case where only one " was found, so that $in_quote_string # will be true after the end of the loop. If true, pop the last number # off of @start_positions pop(@start_positions) if ($in_quote_string); # NOTE: Eureka! Found the quote bug! It's here. If the above code # detects an embedded string, BUT those embedded strings are part # of HTML tags, then this next block, which actually "colorizes" the # tokens, will never get triggered, because the $quote_count will # be zero. If this is the case, then colorize the string normally. # The bug is as simple as NOT providing an "else" clause for this # check..."Silly programmer, bugs are for morons!" =) if (scalar(@start_positions)) { my $new_string = ""; # usually, there will be only one embedded string. However, that's not # a very safe assumption, so I'm generalizing this... my $offset = 0; for (my $index = 0; $index < scalar(@start_positions); $index++) { # calculate the length of the substrings to be extracted my $good_length = $start_positions[$index] - $offset; my $bad_length = $end_positions[$index] - $start_positions[$index] + 1; # extract the substring on which to operate my $good_string = substr($string, $offset, $good_length); # perform the replacement $good_string = substitute_token( string => $good_string, token => $token, html => $html ); # reset the $offset $offset = $start_positions[$index]; # extract the following "bad" string my $bad_string = substr($string, $offset, $bad_length); # reset $offset again $offset = $end_positions[$index] + 1; # now, tack these pieces onto the new $string $new_string .= $good_string; $new_string .= $bad_string; # NOTE: this loop, currently structured, _assumes_ that $string # ends with a "bad" string, which is not always the case. } # one last substring call, to get any leftover "good" strings my $good_length = length($string) - $offset; if ($good_length) { my $good_string = substr($string, $offset, $good_length); $good_string = substitute_token( string => $good_string, token => $token, html => $html ); $new_string .= $good_string; } # make the old string the new string $string = $new_string; } else { # if there are no embedded strings in $string, the operation is much # simpler... $string = substitute_token( string => $string, token => $token, html => $html ); } return $string; } # ---------------------------------------------------------------------- sub substitute_token { my %args = @_; my $string = $args{'string'}; my $token = $args{'token'}; my $html = $args{'html'}; # NOTE: substitution problem -- three different cases need to # be handled. First, the token appears in the middle of the string, # separated by word boundaries. Second, the token appears at the # very beginning of the string. Third, the token appears at the very # end of the string. And, in all cases, the token does _not_ appear # in comments; if so, it is to be ignored. # using the /g modifier, march through every occurrence of $token in # $string, one by one, and decide on a case-by-case basis whether to # replace that occurrence with the relevant HTML tags. No, this won't # work, since it's not the occurrence of the token itself, but the # immediate context of that occurrence, which determines whether to # do the replacement. # NOTE: perhaps make a local copy of $string, condense the whitespace, # then do the token checking. If there's a leading *, then don't # do the substitution. # the "#include <...>" case, where the <> will be the HTML codes for # those characters... if ($string =~ /<(.+)?$token(.+)?>/) { # no-op return $string; } elsif ($string =~ /\b$token\b/) { # the most common case $string =~ s/\b$token\b/$html/g; } # the "beginning" case if ($string =~ /^$token\b/) { $string =~ s/^$token\b/$html/; } # the "ending" case # NOTE: what about the newline character? if ($string =~ /\b$token\n$/) { $string =~ s/\b$token$/$html/; } return $string; } # ---------------------------------------------------------------------- sub usage { print STDOUT <] [-h] [-l ] [-n] [-r] [-s ] [-t ] [-v] file1 file2 .... fileN -T -- generate an index of all tags (functions, macros, etc) -d -- an output directory to contain the generated HTML pages (default is $html_dir/ in the current directory) -h -- help (this message) -l -- specify a log file, to which verbose output will be sent in addition to STDOUT [NOT IMPLEMENTED] -n -- suppress automatic generation of index$html_file_extension -r -- recurse down through the current directory for source files -s -- specify a list of directories, surrounded by quotes, to skip when recursing -t -- specify an alternate tags file (default is a file called "TAGS" in the current directory) -v -- verbose messages while processing If no files are specified on the command line, the program will use the normal file extensions for C source code files (using globbing) in the current directory to find files on which to operate. To generate a tags file, use the ctags(1) program, like thus: ctags -R -x --sort=no > TAGS Using a ctags file, $program can usually be run like thus: $program -r for recursing through the current directory, which contains C source files, and outputting the generated pages into a new html subdirectory. END_OF_USAGE exit; } # ----------------------------------------------------------------------- __END__ # the language-specific keywords -- each language (and yes, I'm counting # C++ as a separate language than C in this case because of the superset # of special keywords it brings) has its own section denoted by HTML-like # tags # NOTE: future improvement -- allow for groups (eg datatypes, preprocessor) case const default else extern for if register define defined elif endif ifdef ifndef include inline sizeof typedef undef char double float int long short struct union void return static switch unsigned while case include ifdef ifndef else endif define undef void return static unsigned char int struct float double short long for while if else extern const sizeof typedef switch # NOTE: these keyword lists are currently unsupported, but I'll keep them # here, in case I ever expand this program to support them. abstract boolean break byte byvalue case cast catch char class const continue default do double else extends false final finally float for future generic goto if implements import inner instanceof int interface long native new null operator outer package private protected public rest return short static super switch synchronized this throw throws transient true try var void volatile while AUTOLOAD BEGIN continue do else elsif END for foreach goto if last local my next package redo require sub then unless use while