package GO::Tool::BiblioTool; use strict; use lib 'go/scratch/tools'; use base 'GO::Tool::GenericGoTool'; ### Various modules required use Data::Dumper; use CGI::Carp qw(fatalsToBrowser); use GO::Object::BiblioEntry; use GO::Object::BiblioSearchEntry; use GO::TestSet qw(dfv_test); use GO::Utilities qw(:all); use GO::Boolean; use GO::GeneralPurposeParser; use URI::Escape; #use Utility::TSLParser; use Time::HiRes qw(gettimeofday tv_interval); my $default_sort_order = [ 'year_asc', 'author', 'title' ]; ### biblio db file my $biblio_db = "/Users/gwg/go/doc/biblio-data.txt"; ### the biblio master, official adder of entries to the GO biblio db ### new biblio will be sent by email to this address for checking my $biblio_master = ""; ### extra data my $extra_data = { display_labels => { 'author' => 'first author', 'year_desc' => 'year, newest first', 'year_asc' => 'year, oldest first', 'year' => 'year', 'category' => 'paper categorization', 'none' => 'don\'t group results', # op => { 'or' => 'any of the words', 'and' => 'all of the words', 'exact' => 'exact phrase', # }, # category => { 1 => 'by GO', # by_go 2 => 'overviews of GO', # go_overview 3 => 'GO annotations', # go_annots 4 => 'genome annotation', # genome_annotation 5 => 'EST annotation', # est_annotation 6 => 'protein annotation',# prot_annotation 7 => 'use of GO in gene expression studies', # gene_expression 8 => 'use of GO in proteomics studies', # proteomics 9 => 'prediction of GO annotations', # go_annotation_prediction 10 => 'GO tools', # go_tool 11 => 'use of GO in biological databases', # bio_db 12 => 'use of GO in data or text mining', # data_mining 13 => 'other applications of GO ', # other_applications 14 => 'publications on biomedical ontologies mentioning GO', # biomed_onts_go_mention 15 => 'publications on OBO ontologies', # obo 16 => 'use of GO in clinical applications', # clinical 17 => 'use of GO in network modeling and analysis', # network_modeling 18 => 'use of GO in comparative genomics and evolutionary analysis', # comp_genomics_evolutionary_analysis 19 => 'use of GO to support predictions', # support_prediction by_go => 'by GO', go_overview => 'overviews of GO', go_annots => 'GO annotations', genome_annotation => 'genome annotation', est_annotation => 'EST annotation', prot_annotation => 'protein annotation', gene_expression => 'use of GO in gene expression studies', proteomics => 'use of GO in proteomics studies', go_annotation_prediction => 'prediction of GO annotations', go_tool => 'GO tools', bio_db => 'use of GO in biological databases', data_mining => 'use of GO in data or text mining', other_applications => 'other applications of GO ', biomed_onts_go_mention => 'publications on biomedical ontologies mentioning GO', obo => 'publications on OBO ontologies', clinical => 'use of GO in clinical applications', network_modeling => 'use of GO in network modeling and analysis', comp_genomics_evolutionary_analysis => 'use of GO in comparative genomics and evolutionary analysis', support_prediction => 'use of GO to support predictions', # }, }, }; sub _specification { my $self = shift; my $form_values = { 'sort' => [ 'year_desc', 'author', 'title', 'year_asc' ], group_by => [ 'year', 'category', 'none' ], op => [ 'or', 'and', 'exact' ], page_size => [ qw( 10 20 50 100 all ) ], }; my @params = ( 'sort', { type => 'array', allow_multiple => 1, # type => 'scalar', test => [ dfv_test('is_in_list_p', { list => $form_values->{'sort'}, this => 1 }), $self->dfv_do_sub(\&check_sort_params, { param => [ 'group_by', 'sort', 'sort2', ] }) ], form_values => $form_values->{'sort'}, default => sub { my $self = shift; my $seen_h; # get the query my $query_h = $self->get_filtered_data; # eliminate the group_by criterion if (!$query_h->{group_by}) { return $default_sort_order->[0]; } (my $var = $query_h->{group_by}) =~ s/_(a|de)sc$//; $seen_h->{$var}++; foreach (@$default_sort_order) { (my $c = $_) =~ s/_(a|de)sc$//; next if $seen_h->{$c}; return $_; } }, }, 'sort2', { type => 'scalar', test => [ dfv_test('is_in_list_p', { list => $form_values->{'sort'}, this => 1 }), $self->dfv_do_sub(\&check_sort_params, { param => [ 'group_by', 'sort', 'sort2', ] }) ], dependencies => [ 'sort' ], form_values => $form_values->{'sort'}, }, 'group_by', { type => 'scalar', test => dfv_test('is_in_list_p', { list => $form_values->{group_by}, this => 1 }), field_filters => sub { my $value = shift; return if $value eq 'none'; return $value; }, default => 'none', form_values => $form_values->{'group_by'}, }, 'query', { search => { type => 'scalar', test => qr/\w{2,}/, # number of consecutive work characters must be greater than 1 required => 1, }, # '*' => { # type => 'scalar', # }, }, 'op', { search => { type => 'scalar', test => dfv_test('is_in_list_p', { list => $form_values->{op}, this => 1 }), default => 'or', form_values => $form_values->{'op'}, }, '*' => { type => 'scalar', form_values => $form_values->{'op'}, default => 'or', }, }, 'page', { type => 'scalar', test => qr/^\d+$/, # must be a number default => '1', }, 'page_size', { type => 'scalar', test => qr/^(\d+|all)$/, # must be a number or all default => '20', form_values => $form_values->{page_size}, }, 'show_comments', { type => 'scalar', test => qr/^(1|true|on)$/, }, 'show_category', { type => 'scalar', test => qr/^(1|true|on)$/, }, 'link_out', { type => 'scalar', test => qr/^(1|true|on)$/, }, 'year', { test => dfv_test('is_number_between_p', { min => 1990, max => 2010, this => 1 }), # must be a valid year # default => undef, # we actually want it to be the first value in the sorted list }, 'category', { test => qr/^\d+$/, # must be a number (currently, anyway) # test => ??? # default => undef, # we actually want it to be the first value in the sorted list }, # journal => { # test => qr/\w{3,}/, # must have more than three consecutive letters # }, 'rm', { test => dfv_test('is_in_list_p', { list => [ qw( query_form browse search ) ], this => 1 }), default => 'query_form', dependencies => { "search" => [ qw( query ) ], } }, ); return @params; } sub _dfv_data { return { filters => [ 'trim', 'strip' ] }; } =head2 Bibliography tool Run modes: - query_form: initial page with the search / browse options on it use CGI values if there are any, or the defaults if not - browse: load the biblio if 'group_by' is set, filter by the group_by crit sort the list get the subset (using page / page_size) if reqd - search: get the search query (query, op) and create the appropriate query object load the biblio perform the search if 'group_by' is set, filter by the group_by crit sort the list get the subset (using page / page_size) if reqd =cut sub setup { my $self = shift; $self->start_mode('query_form'); $self->mode_param(\&set_run_mode); $self->error_mode('die_and_output_error'); $self->run_modes( 'query_form' => 'query_form', 'browse' => 'browse', 'search' => 'search', 'fatal_error' => 'die_and_output_error', 'AUTOLOAD' => 'query_form', ); } sub set_run_mode { my $self = shift; my %cgi_h = $self->query->Vars; if (defined $cgi_h{rm}) { return $cgi_h{rm}; } elsif (exists $cgi_h{query}) { return 'search'; } return 'query_form'; } # configure the template before we do anything else sub cgiapp_init { my $self = shift; $self->tt_config( TEMPLATE_OPTIONS => { INCLUDE_PATH=> [ qw( /Users/gwg/go/scratch/tools/templates/ /Users/gwg/go/www/) ], TRIM=>1, }, ); } ## stuff to do before we run the page-specific code sub cgiapp_prerun { my $self = shift; $self->startme; # check the input query, replace dodgy params with the defaults my $results = $self->check_input_query({ replace_with_defaults => 1, spec_profile => $self->get_current_runmode }); $self->debugme("success status: ".Dumper($results->success)); if (!$results->success) { # if the problem is just with the sort / grouping parameters, ignore it # we can generate the sort list ourselves $self->debugme("results: ".Dumper($results)); my $hash_ref; foreach (@{ $results->missing }, keys %{ $results->invalid }) { $hash_ref->{$_}++; } $self->debugme("Dodgy fields: ".Dumper($hash_ref)); # if there are fields other than the sort, sort2 and group_by fields # that are invalid, we should just go on regardless my $fail; foreach (keys %$hash_ref) { $fail++ && last if $_ ne 'sort' || $_ ne 'sort2' || $_ ne 'group_by'; } # create an error message my $err_str; # if the mode is search and we have problemos with the query, # go back to showing the query form if ($self->get_current_runmode eq 'search' && ! $results->valid('query') ) { if ($results->missing('query')) { $err_str = 'You have not entered a query.'; } elsif ($results->invalid('query')) { $err_str = 'You entered an invalid query.'; } # update the validated query to reflect this my $query_h = $self->param('validated_query'); $query_h->{rm} = 'query_form'; $self->param('validated_query', $query_h); } if ($err_str && $fail) { $self->fatal_msg($err_str); # set the mode to 'query_form' $self->prerun_mode('query_form'); } else { #$self->fatal_msg( join "
", GO::Utilities::summarize_errors($results) ); # this is some other kind of error... return an error page #$self->prerun_mode('fatal_error'); } } # create / update the sort order $self->create_sort_order; $self->debugme("run mode: ".$self->prerun_mode); } sub search { my $self = shift; return $self->retrieve_data({ search => 1 }); } sub browse { my $self = shift; return $self->retrieve_data; } =head2 query_form The 'front page' of the bibliography, with the search, browse and viewing options =cut sub query_form { my $self = shift; $self->startme; # get the other bits and pieces for the template and output it return $self->tt_process("view_biblio.tmpl", $self->set_up_output) || die "Template toolkit messed up, the bastard!: ", $self->tt_obj->error(), "Dying"; } =head2 retrieve_data Function to get data from the DB, either for browsing or by searching. input: $self, optional flag which will trigger a search if present; otherwise, the operation will be browse. - browse: load the biblio if 'group_by' is set, filter by the group_by crit sort the list get the subset (using page / page_size) if reqd - search: load the biblio get the search query (query, op) and create the appropriate query object perform the search if 'group_by' is set, filter by the group_by crit sort the list get the subset (using page / page_size) if reqd =cut sub retrieve_data { my $self = shift; $self->startme; my $options = shift; my $output; my $t1 = [gettimeofday]; my $prepare_record; # get the checked query my $query_h = $self->param('validated_query'); # get the parser ready for action my $parser_args = { file => $biblio_db, parser => 'multi_field_tag_value', return_record_as_object => 'GO::Object::BiblioEntry', check_input => 1, filter_field => qr/^[^!]/, }; # Let's see what we want to search for if ($self->get_current_runmode eq 'search') { # $self->debugme("query_h: ". Dumper($query_h)); # $output->{query_h} = $query_h; # create the search query # Get the query string and the operator my $query = $query_h->{query}; my $op = $query_h->{op}; my @search_strings; # for the time being, just use the first query if (ref $query eq 'ARRAY') { $query = $query->[0]; } # let's have a look at our search params if ($op eq 'exact') { # exact query. Keep all the words together push @search_strings, $query; $op = 'and'; } elsif ($op eq 'or' || $op eq 'and') { @search_strings = split(/\s+/, $query); } # create the query my $query_struct = { uc($op) => [ map { { FN => \&GO::TestSet::test_param, ARGS => ['is_in_string_switched_p', { string => qr/$_/i }], } } @search_strings ], }; $self->debugme("query_struct: ".Dumper($query_struct)); my $search_for; foreach (@search_strings) { push @$search_for, { FN => \&GO::TestSet::test_param, ARGS => [ 'is_in_string_switched_p', { string => qr/$_/i }], }; } # create the query # check for $query_h->{id} appearing as a go_ref_id or alt_id # ignore any record which doesn't feature $query_h->{id} somewhere $parser_args->{filter_record} = $query_struct; # $parser_args->{filter_record} = [ map { qr/(go_ref|alt)_id: $_\b/i } @{$query_h->{id}} ]; } # load the biblio; dies if no biblio is present my $biblio_arr = $self->get_data_from_file($parser_args); $self->param('n_results', scalar @$biblio_arr); if (!$biblio_arr || !@$biblio_arr) { # if there's no data, die. $self->printerr("FATAL: No reference data found in $biblio_db"); die "No valid data could be found. Please check $biblio_db and try again. Dying"; } my $t2 = [gettimeofday]; $self->debugme("Parsing done in ". (tv_interval $t1, $t2)); my $biblio_h; # set the number of the entries in the bibliography $self->param("n_bibliography_entries", scalar @$biblio_arr ); my $t3 = [gettimeofday]; $self->debugme("Searching done in ". (tv_interval $t2, $t3)); # if we have a group_by param, add it to the fray # sort the results into their groups, and then pick the appropriate group to # return as the results if ($query_h->{group_by} && $query_h->{group_by} ne 'none') { # my $grouped_data = $self->group_data_by_param( $biblio_arr, { PARAM => $query_h->{group_by} } ); $self->debugme("grouped data keys: ".Dumper( [ keys %$grouped_data ] )); $self->debugme("None value: ". Dumper( $grouped_data->{NONE} )); # make an index with values and counts my $index; foreach ( sort keys %$grouped_data ) { push @$index, { value => $_, title => $extra_data->{display_labels}{$_} || $_, count => scalar @{$grouped_data->{$_}} }; } $output->{grouped_data_index} = $index; # get the subset of the results that we want # reset biblio_arr to the new result set if ($query_h->{ $query_h->{group_by} }) { #undef $biblio_arr; if (ref($query_h->{ $query_h->{group_by} }) eq 'ARRAY') { # create a query and get the matching items # need to decide whether it's an AND or an OR operation... # create a hash of the entries if one hasn't already been made # (for ease of access/use) if (!$biblio_h) { my $acc = 1; $acc++ and $biblio_h->{$acc} = $_ foreach @$biblio_arr; } my $query = { AND => [ map { { PARAM => $query_h->{group_by}, FN => \&GO::TestSet::test_param, ARGS => [ 'is_the_same_as_p', { string => $_ } ], } } @{$query_h->{$query_h->{group_by}}} ], }; my $result_h = GO::Boolean::run_boolean_query( $query, $biblio_h ); if ($result_h && %$result_h) { $biblio_arr = [ values %$result_h ];# if $result_h && %$result_h; } else { undef $biblio_arr; } } else { # see if this value is present or not # set biblio_arr to that value if it is present $biblio_arr = $grouped_data->{ $query_h->{ $query_h->{group_by} } } || undef; #if $grouped_data->{ $query_h->{ $query_h->{group_by} } }; } # add an error message if no results in that category if (!$biblio_arr) { $self->warning_msg('There are no results with the chosen grouping value.'); } } else { # if we have 'group_by' set, but don't have a value # use the first one in the list $biblio_arr = $grouped_data->{ $index->[0]{value} }; # set this value in query_h $query_h->{ $query_h->{group_by} } = $index->[0]{value}; $self->param('validated_query', $query_h); } } # $self->debugme("\$biblio_arr: ".Dumper($biblio_arr)); if ($biblio_arr) { # work out what our sort order should be # query_h->{default_sort_order} # year_asc - this is fine # year_desc - we want sprintf("%05d", (100000 - $_->year)) my $sort_order = $self->param('sort_order'); # remap the 'year' params my $remapping = { 'year_asc' => 'year', 'year_desc' => { do_fn => { fn => sub { my $number = shift; $number = 10000 - $number; return sprintf("%05d", $number) },#(10000 - shift)) }, obj => { has_fn => 'year' }, }, }, }; foreach (@$sort_order) { $_ = $remapping->{$_} if $remapping->{$_}; } my $sorted_list = GO::Utilities::sort_list({ list => $biblio_arr, crit => $sort_order }); # $self->debugme("sorted_list: ".Dumper($sorted_list)); # get the subset of the results, if appropriate my $paged = get_results_chunk($sorted_list, { chunk_n => $query_h->{page}, chunk_size => $query_h->{page_size} }); $output->{biblio_arr} = $paged->{subset}; $output->{n_pages} = $paged->{n_chunks}; $output->{page} = $paged->{chunk_n}; } # set up all the other stuff and output the template return $self->tt_process("view_biblio.tmpl", $self->set_up_output($output)) || die "Template toolkit messed up, the bastard!: ", $self->tt_obj->error(), "Dying"; } =head2 tt_pre_process { Populates various useful variables from the environment Args: $template_vars # any data to go in the template =cut #sub tt_pre_process { sub set_up_output { my $self = shift; $self->startme; my $tmpl_vars = shift || {}; # my $template_to_use = shift || 'view_biblio'; my $spec = $self->get_spec; # the checked query into $output $tmpl_vars->{query_h} = $self->param('validated_query'); # get the number of entries in the biblio and the number of search results (if # appropriate) $tmpl_vars->{n_bibliography_entries} = $self->param("n_bibliography_entries") if $self->param("n_bibliography_entries"); $tmpl_vars->{n_results} = $self->param("n_results") if $self->param("n_results"); $tmpl_vars = { %$tmpl_vars, base_url => 'http://127.0.0.1/cgi-bin/biblio.cgi', install_dir => 'http://127.0.0.1/go/', }; # if there are errors, add them to the template $tmpl_vars->{message} = $self->get_all_msgs('show_level') if $self->has_msgs; =CUT 'sort' => { 'sort2' => { group_by => { query => { op => { page => { page_size => { show_comments => { show_category => { link_out => { year => { category => { rm => { =cut ### set up URLs for paging / browsing / etc. my $url_h; foreach my $p (keys %$spec) { # if the parameter is set AND there's no default or the parameter differs # from the default, add the param to the URL if ($tmpl_vars->{query_h}{$p} && # no default (!$spec->{$p}{default} || # default, param differs from default ($spec->{$p}{default} && $spec->{$p}{default} ne $tmpl_vars->{query_h}{$p} ) || # more than one value (ref($tmpl_vars->{query_h}{$p}) eq 'ARRAY' && scalar @{$tmpl_vars->{query_h}{$p}} > 1) ) ) { $url_h->{$p} = $tmpl_vars->{query_h}{$p}; } } # paging URLs: need everything except page number delete $url_h->{page} if $url_h->{page}; $tmpl_vars->{url_for_paging} = $self->create_url($url_h); $self->debugme("url for paging: ".Dumper($tmpl_vars->{url_for_paging})); # if we have a grouping parameter set if ($url_h->{group_by} && $url_h->{group_by} ne 'none') { # grouping URLs: don't need page or category/year delete $url_h->{category}; delete $url_h->{year}; $tmpl_vars->{url_for_grouping} = $self->create_url($url_h); $self->debugme("url_for_grouping: ".Dumper($tmpl_vars->{url_for_grouping})); } ### get the bits for the form # misc stuff: what to show, etc., and paging foreach my $x qw(show_category show_comments link_out page) { $tmpl_vars->{$x} = $tmpl_vars->{query_h}{$x} if $tmpl_vars->{query_h}{$x}; } # sort settings and options # how to group results # page size foreach my $var qw( sort sort2 group_by page_size op ) { $tmpl_vars->{$var} = $tmpl_vars->{query_h}{$var}; foreach (@{$spec->{$var}{form_values}}) { my $h = { value => $_ }; $h->{label} = $extra_data->{display_labels}{$var}{$_} || $extra_data->{display_labels}{$_} || $_; $h->{selected} = 1 if $tmpl_vars->{query_h}{$var} eq $_; push @{$tmpl_vars->{ $var.'_options'}}, $h; } } $self->debugme("sort: ".Dumper($tmpl_vars->{'sort_options'})); return $tmpl_vars; } =head2 check_sort_params Ensure that we don't have any duplication in the sort params and the group by params input: sort, sort2 and group_by from the query_h output: returns 1 if all the params mentioned are unique =cut sub check_sort_params { GO::MsgLite->startme; GO::MsgLite->debugme("sort params: ".Dumper(\@_)); my $seen_h; foreach (@_) { my $v = $_; $v = [ $v ] unless ref($v) eq 'ARRAY'; foreach (@$v) { (my $w = $_) =~ s/year.+/year/; return if $seen_h->{$w}; $seen_h->{$w}++; } } return 1; } =head2 create_sort_order Uses the values in the query_h and the defaults to create the sort order Sets the $self param 'sort_order' to be the current sort list input: $self, $arg_list, # optional; array of known sort values $return_pos # optional; which sort variable we want output: variable in sort position =cut sub create_sort_order { my $self = shift; $self->startme; my $return_pos; my $sort_order_list; # set the sort order here my $query_h = $self->param('validated_query'); my $seen_h; my $arg_list; # if arg_list is undefined, set the values from $self->param('validated query') if ($query_h->{group_by}) { (my $gb = $query_h->{group_by}) =~ s/_(de|a)sc$//g; $seen_h->{$gb}++; } # populate the current sort order by going through @$arg_list # keep track of which params we've seen already $self->debugme("seen_h: ".Dumper($seen_h)); foreach my $s qw(sort sort2) { if (defined $query_h->{$s}) { $self->debugme("query_h->{s}: ".Dumper($query_h->{$s})); $query_h->{$s} = [ $query_h->{$s} ] if !ref $query_h->{$s}; foreach my $x (@{$query_h->{$s}}) { push @$arg_list, $x unless grep { $x eq $_ } @$arg_list; } } } # push the default sort order on to the arg_list push @$arg_list, @$default_sort_order; # check that the args are OK and we don't have any duplicates foreach my $a (@$arg_list) { (my $item = $a) =~ s/_(de|a)sc$//g; $self->debugme("item: $item"); # if we've seen it, go on to the next item next if $seen_h->{$item}; push @$sort_order_list, $a; $seen_h->{$item}++; } $self->param('sort_order', $sort_order_list); $self->debugme("sort_order post fn: ".join(", ", @$sort_order_list)); # repopulate sort and sort2 $query_h->{'sort'} = $sort_order_list->[0]; $query_h->{'sort2'} = $sort_order_list->[1]; $self->param('validated_query', $query_h); # if ($return_pos && $return_pos =~ /^[1-3]$/) # { return $sort_order_list->[ $return_pos-- ]; # } } 1;